コード例 #1
0
def main():
    """Crawl 81.cn list pages 1-5 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    paginated index and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: page 1 has no suffix, later pages use "_<n>".
    start_url = "http://www.81.cn/rd/node_92585%s.htm"
    for i in range(1, 6):
        suffix = '' if i == 1 else '_%d' % i
        url = start_url % suffix
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode, url-prefix) extraction rule for article links.
        colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab',
                  'http://www.81.cn/rd/')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #2
0
def main():
    """Crawl cankaoxiaoxi.com list pages 1-100 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    numbered index pages and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: list pages are numbered 1.shtml .. 100.shtml.
    start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml"
    for i in range(1, 101):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode, url-prefix) extraction rule for article links.
        colum = [(
            'a',
            '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href',
            'sab', '')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #3
0
def main():
    """Crawl mil.huanqiu.com list pages 1-100 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    paginated index and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: the first page is "index.html", later pages are "<n>.html".
    start_url = "http://mil.huanqiu.com/world/%s.html"
    for i in range(1, 101):
        page = 'index' if i == 1 else i
        url = start_url % page
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode) extraction rule for article links.
        colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #4
0
ファイル: 5.1.py プロジェクト: aiyang-t/crawler
def main():
    """Crawl junshi.xilu.com list pages 1-100 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    numbered index pages and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: list pages are numbered index_1372_1.html .. index_1372_100.html.
    start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html"
    for i in range(1, 101):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode) extraction rule for article links.
        colum = [(
            'a',
            '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href',
            'l')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #5
0
ファイル: 3.2.py プロジェクト: aiyang-t/crawler
def main():
    """Crawl the xinhuanet node-list API, pages 1-5, delegating each page to read_detial."""
    output_folder = File_floder()
    output_folder.add(path_p=path)
    # Write the CSV header row first (the `str` keyword is the API's own name).
    header_writer = Csv_base()
    header_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: the API is paginated via the pgnum query parameter.
    start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1"
    for page_no in range(1, 6):
        read_detial(start_url % page_no, page_no)
コード例 #6
0
ファイル: 7.1.py プロジェクト: aiyang-t/crawler
def main():
    """Crawl thepaper.cn load_index pages 1-25 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    paginated JSON-backed index and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    # Reformatted to PEP 8 spacing to match the sibling examples.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: pagination is driven by the pageidx query parameter.
    start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471"
    for i in range(1, 26):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode, url-prefix) extraction rule for article links.
        colum = [('a', '//div[@class="news_li"]//h2//a//@href', 'sab',
                  'https://www.thepaper.cn/')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #7
0
def main():
    """Crawl the single military.china.com news index page and hand each link to read_detial.

    Creates the output folder, writes the CSV header row (note: no page-number
    column here, unlike the paginated examples), then extracts article URLs
    from the one index page with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(file_path=path + "/data.csv",
                                   mode='w+',
                                   str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间'])
    # Crawler: a single, unpaginated index page.
    start_url = "https://military.china.com/news/"

    list_html = htmlSource.get_html(url_p=start_url, type_p='rg')
    # (tag, xpath, mode) extraction rule for article links.
    colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href',
              'l')]
    # Renamed from `list` to avoid shadowing the builtin.
    links = rule.html_content_analysis_detial(html_text=list_html,
                                              column=colum,
                                              url=start_url)
    for a in links[0][1]:
        read_detial(a)
コード例 #8
0
def main():
    """Crawl military.people.com.cn list pages 1-7 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    numbered index pages and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: list pages are numbered index1.html .. index7.html.
    start_url = "http://military.people.com.cn/GB/1077/index%d.html"
    for i in range(1, 8):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode, url-prefix) extraction rule for article links.
        colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href',
                  'sab', 'http://military.people.com.cn')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #9
0
def main():
    """Crawl dsti.net arms-list pages 1-814 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    numbered index pages and extracts article URLs with an XPath rule.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: list pages are numbered /arms/1 .. /arms/814.
    start_url = "http://www.dsti.net/Information/HyeList/arms/%d"
    for i in range(1, 815):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode, url-prefix) extraction rule for article links.
        colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href',
                  'sab', 'http://www.dsti.net')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            read_detial(a, i)
コード例 #10
0
ファイル: 1.2.py プロジェクト: aiyang-t/crawler
def main():
    """Crawl mil.news.sina.com.cn roll pages 1-25 and hand each article link to read_detial.

    Creates the output folder, writes the CSV header row, then walks the
    paginated index, keeping only links that end in ".shtml" (real article
    pages) before dispatching them.
    """
    floder = File_floder()
    floder.add(path_p=path)
    # Renamed from `csv` to avoid shadowing the stdlib module name.
    csv_writer = Csv_base()
    # Header row; `str` is the external API's keyword name and must stay.
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler: pagination is driven by the page query parameter.
    start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d"
    for i in range(1, 26):
        url = start_url % i
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        # (tag, xpath, mode) extraction rule for article links.
        colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')]
        # Renamed from `list` to avoid shadowing the builtin.
        links = rule.html_content_analysis_detial(html_text=list_html,
                                                  column=colum,
                                                  url=url)
        for a in links[0][1]:
            # Idiomatic suffix check; replaces a[len(a) - 6:] == '.shtml'.
            if a.endswith('.shtml'):
                read_detial(a, i)
コード例 #11
0
ファイル: xiachufang.py プロジェクト: aiyang-t/crawler
# -- coding: UTF-8 --

from common.HtmlSource import HtmlSource
from common.Rule import Rule
#from common.inc_conn import Conn_mysql
from common.inc_csv import Csv_base
from common.inc_file import File_file, File_floder
import requests
from lxml import html
import time

floder = File_floder()
htmlSource = HtmlSource()
rule = Rule()
csv = Csv_base()
flag = 0
commontitle = 1


# 多线程
def read_detial(url, path):
    if (str(url[0][1]).startswith("http") and url[1][1] != "收起"):
        # TODO 翻页
        for i in range(1, 11):
            print(url[0][1])
            detial_html = htmlSource.get_html(url_p=url[0][1] +
                                              "/?page=%d" % i,
                                              type_p='rg')
            tree = html.fromstring(detial_html)
            hreflist = tree.xpath(
                '//ul[@class="list"]/li/div/div/p[@class="name"]/a/@href')