def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://www.81.cn/rd/node_92585%s.htm" for i in range(1, 6): ss = '' if (i == 1): ss = '' else: ss = '_%d' % i url = start_url % (ss) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab', 'http://www.81.cn/rd/')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: read_detial(a, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml" for i in range(1, 101): url = start_url % (i) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [( 'a', '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href', 'sab', '')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: read_detial(a, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://mil.huanqiu.com/world/%s.html" for i in range(1, 101): s = 'index' if (i == 1): s = 'index' else: s = i url = start_url % (s) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: read_detial(a, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html" for i in range(1, 101): url = start_url % (i) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [( 'a', '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href', 'l')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: read_detial(a, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1" for i in range(1, 6): url = start_url % (i) read_detial(url, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line(file_path=path+"/data.csv",mode='w+',str=['网站名','网址','标题','新闻内容','发布时间','页码','采集时间']) # 爬虫 start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471" for i in range(1,26): url = start_url%(i) #print(url) list_html = htmlSource.get_html(url_p=url,type_p='rg') #print(list_html) colum=[('a','//div[@class="news_li"]//h2//a//@href','sab','https://www.thepaper.cn/')] list = rule.html_content_analysis_detial(html_text=list_html,column=colum,url=url) #print(list) for a in list[0][1]: read_detial(a,i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line(file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间']) # 爬虫 start_url = "https://military.china.com/news/" #print(url) list_html = htmlSource.get_html(url_p=start_url, type_p='rg') #print(list_html) colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href', 'l')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=start_url) #print(list) for a in list[0][1]: read_detial(a)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://military.people.com.cn/GB/1077/index%d.html" for i in range(1, 8): url = start_url % (i) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href', 'sab', 'http://military.people.com.cn')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: read_detial(a, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://www.dsti.net/Information/HyeList/arms/%d" for i in range(1, 815): url = start_url % (i) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href', 'sab', 'http://www.dsti.net')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: read_detial(a, i)
def main(): floder = File_floder() floder.add(path_p=path) csv = Csv_base() csv.write_csv_file_line( file_path=path + "/data.csv", mode='w+', str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间']) # 爬虫 start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d" for i in range(1, 26): url = start_url % (i) #print(url) list_html = htmlSource.get_html(url_p=url, type_p='rg') #print(list_html) colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')] list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url) #print(list) for a in list[0][1]: if (a[len(a) - 6:] == '.shtml'): read_detial(a, i)
# -*- coding: UTF-8 -*-
from common.inc_csv import Csv_base
from common.inc_file import File_floder

import requests

if __name__ == '__main__':
    file_path = '../data/百科候选关键词.csv'
    folder_path = "../data/百科候选关键词/img"
    folder = File_floder()
    folder.add(folder_path)
    file = Csv_base()
    rows = file.read_csv_file(file_path)
    for row in rows:
        try:
            # Column 4 holds the image URL, wrapped in backticks.
            img_url = str(row[4]).replace('`', '')
            if img_url != '':
                # Derive a file name from the URL, dropping any query string.
                img_name = img_url.split("/")[-1]
                if img_name.find("?") > -1:
                    img_name = img_name[0:img_name.find("?")]
                # Download is disabled; only the URL-to-name mapping is recorded.
                #img_content = requests.get(img_url).content
                #with open('../data/百科候选关键词/img/%s.jpg' % img_name, 'wb') as f:
                #    f.write(img_content)
                rw_str = [img_url, img_name, row[11].replace('`', '')]
                file.write_csv_file_line(file_path="../data/百科候选关键词_img.csv",
                                         str=rw_str)
        except Exception as e:
            print(e)
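If the commented-out download were re-enabled, a slightly hardened variant might look like the following sketch; the helper name, the timeout, and the HTTP status check are additions, and the save directory mirrors folder_path above.

import os

import requests


def download_image(img_url, img_name, save_dir="../data/百科候选关键词/img"):
    """Hypothetical helper mirroring the commented-out block above, with a
    timeout and HTTP status check added before writing the file."""
    resp = requests.get(img_url, timeout=10)
    resp.raise_for_status()
    with open(os.path.join(save_dir, '%s.jpg' % img_name), 'wb') as f:
        f.write(resp.content)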