def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    # CSV header columns: site name, URL, title, article body, publish time,
    # page number, crawl time.
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: www.81.cn "rd" channel; page 1 has no suffix, later pages use "_N".
    start_url = "http://www.81.cn/rd/node_92585%s.htm"
    for i in range(1, 6):
        ss = '' if i == 1 else '_%d' % i
        url = start_url % ss
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab',
                  'http://www.81.cn/rd/')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
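# The suffix-only-after-page-1 URL scheme above recurs across these crawlers;
# a minimal sketch of the idea, assuming the site keeps the convention seen in
# main() (node_92585.htm, node_92585_2.htm, ...). node_page_url is a
# hypothetical helper, not part of the original crawler.
def node_page_url(i):
    # Page 1 carries no suffix; pages 2+ append "_<n>" before ".htm".
    return "http://www.81.cn/rd/node_92585%s.htm" % ('' if i == 1 else '_%d' % i)

# node_page_url(1) -> http://www.81.cn/rd/node_92585.htm
# node_page_url(3) -> http://www.81.cn/rd/node_92585_3.htm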
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: www.cankaoxiaoxi.com military channel, pages 1-100.
    start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml"
    for i in range(1, 101):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a',
                  '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href',
                  'sab', '')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: mil.huanqiu.com; page 1 is "index", later pages are numbered.
    start_url = "http://mil.huanqiu.com/world/%s.html"
    for i in range(1, 101):
        s = 'index' if i == 1 else i
        url = start_url % s
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: junshi.xilu.com, pages 1-100.
    start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html"
    for i in range(1, 101):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a',
                  '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href',
                  'l')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: Xinhua list API; the paginated list URL itself is passed to
    # read_detial, which handles parsing for this source.
    start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1"
    for i in range(1, 6):
        url = start_url % i
        read_detial(url, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: www.thepaper.cn load_index endpoint, pages 1-25.
    start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471"
    for i in range(1, 26):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a', '//div[@class="news_li"]//h2//a//@href', 'sab',
                  'https://www.thepaper.cn/')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    # This source is a single listing page, so the header has no page-number column.
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间'])
    # Crawler: military.china.com news listing (one page, no pagination loop).
    start_url = "https://military.china.com/news/"
    list_html = htmlSource.get_html(url_p=start_url, type_p='rg')
    colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href', 'l')]
    result = rule.html_content_analysis_detial(html_text=list_html,
                                               column=colum, url=start_url)
    for a in result[0][1]:
        read_detial(a)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: military.people.com.cn, pages 1-7.
    start_url = "http://military.people.com.cn/GB/1077/index%d.html"
    for i in range(1, 8):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href',
                  'sab', 'http://military.people.com.cn')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: www.dsti.net arms channel, pages 1-814.
    start_url = "http://www.dsti.net/Information/HyeList/arms/%d"
    for i in range(1, 815):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href',
                  'sab', 'http://www.dsti.net')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: mil.news.sina.com.cn roll listing, pages 1-25.
    start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d"
    for i in range(1, 26):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum, url=url)
        for a in result[0][1]:
            # Only article pages end in ".shtml"; skip everything else.
            if a.endswith('.shtml'):
                read_detial(a, i)
# -*- coding: utf-8 -*-
from common.HtmlSource import HtmlSource
from common.Rule import Rule
#from common.inc_conn import Conn_mysql
from common.inc_csv import Csv_base
from common.inc_file import File_file, File_floder
import requests
from lxml import html
import time

floder = File_floder()
htmlSource = HtmlSource()
rule = Rule()
csv = Csv_base()
flag = 0
commontitle = 1


# Multithreading
def read_detial(url, path):
    # Skip non-http links and the "收起" ("collapse") anchor.
    if str(url[0][1]).startswith("http") and url[1][1] != "收起":
        # TODO: pagination
        for i in range(1, 11):
            print(url[0][1])
            detial_html = htmlSource.get_html(url_p=url[0][1] + "/?page=%d" % i,
                                              type_p='rg')
            tree = html.fromstring(detial_html)
            hreflist = tree.xpath(
                '//ul[@class="list"]/li/div/div/p[@class="name"]/a/@href')
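# The column tuples handed to rule.html_content_analysis_detial throughout the
# main() variants appear to use 'l' for hrefs that are already absolute and
# 'sab' plus a base prefix for site-relative hrefs. Below is a minimal,
# self-contained sketch of that link-resolution step, assuming that reading of
# the modes; the real logic lives in common.Rule, which is not shown here, and
# extract_links is a hypothetical stand-in, not part of the original code.
from urllib.parse import urljoin

from lxml import html as lxml_html


def extract_links(html_text, xpath_expr, mode, base=''):
    # Pull raw href values out of the listing page.
    tree = lxml_html.fromstring(html_text)
    hrefs = tree.xpath(xpath_expr)
    if mode == 'sab':
        # Resolve relative hrefs against the given site base.
        return [urljoin(base, h) for h in hrefs]
    # 'l': links are usable as-is.
    return [str(h) for h in hrefs]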