Example 1
def main():
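    # Note: path, htmlSource, rule and read_detial are module-level names
    # defined elsewhere in the project; only main() is shown in each example.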
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        # Column headers: site name, URL, title, article body, publish time, page number, crawl time
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://www.81.cn/rd/node_92585%s.htm"
    for i in range(1, 6):
        # Page 1 has no suffix; later pages append "_<n>"
        suffix = '' if i == 1 else '_%d' % i
        url = start_url % suffix
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab',
                  'http://www.81.cn/rd/')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        # result[0][1] is the list of links extracted by the first rule
        for a in result[0][1]:
            read_detial(a, i)
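These examples all lean on project-specific helpers whose definitions are not shown: htmlSource.get_html fetches a page and rule.html_content_analysis_detial applies XPath rules to it. Judging purely from how they are called, a minimal stand-in built on requests and lxml might look like the sketch below; the (name, xpath, mode, base) shape of the rule tuples and the meaning of the 'sab' mode (prefix relative links with a base URL) are inferred from usage, not taken from the project's real code.

import requests
from lxml import etree

def get_html(url_p, type_p='rg'):
    # Stand-in for htmlSource.get_html: fetch a page and return its text.
    # type_p is accepted but ignored here; its real meaning is unknown.
    resp = requests.get(url_p, timeout=10)
    resp.encoding = resp.apparent_encoding  # several target sites are GBK-encoded
    return resp.text

def html_content_analysis_detial(html_text, column, url):
    # Stand-in for rule.html_content_analysis_detial: apply each rule and
    # return [(name, [matched values...]), ...], matching the result[0][1] usage.
    tree = etree.HTML(html_text)
    result = []
    for col in column:
        name, xpath, mode = col[0], col[1], col[2]
        values = [str(v) for v in tree.xpath(xpath)]
        if mode == 'sab':
            # 'sab' rules carry a fourth element: a base URL for relative links
            base = col[3]
            values = [v if v.startswith('http') else base + v for v in values]
        result.append((name, values))
    return result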
Example 2
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [(
            'a',
            '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href',
            'sab', '')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            read_detial(a, i)
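read_detial itself is not shown either. Given the header row written in main() (site name, URL, title, article body, publish time, page number, crawl time), a plausible shape for it is sketched below, reusing the same module-level htmlSource, Csv_base and path as the examples; the XPaths, the site-name string, and the append mode are placeholders for illustration, not the project's real rules.

import time
from lxml import etree

def read_detial(url, page=None):
    # Hypothetical sketch: fetch one article page, extract the fields the
    # CSV header expects, and append a row to data.csv.
    html = htmlSource.get_html(url_p=url, type_p='rg')
    tree = etree.HTML(html)
    title = ''.join(tree.xpath('//h1//text()')).strip()               # placeholder XPath
    body = ''.join(tree.xpath('//div[@class="content"]//text()')).strip()
    pub_time = ''.join(tree.xpath('//span[@class="time"]//text()')).strip()
    row = ['site-name', url, title, body, pub_time, page,
           time.strftime('%Y-%m-%d %H:%M:%S')]
    # mode='a+' is assumed here so each article appends one row
    Csv_base().write_csv_file_line(file_path=path + "/data.csv", mode='a+', str=row)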
Example 3
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://mil.huanqiu.com/world/%s.html"
    for i in range(1, 101):
        # Page 1 is "index.html"; later pages are "<n>.html"
        s = 'index' if i == 1 else i
        url = start_url % s
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            read_detial(a, i)
Example 4
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [(
            'a',
            '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href',
            'l')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            read_detial(a, i)
Example 5
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1"
    for i in range(1, 6):
        url = start_url % (i)
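        # Note: the paginated list URL is passed straight to read_detial here,
        # rather than being parsed for article links first.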

        read_detial(url, i)
Example 6
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="news_li"]//h2//a//@href', 'sab',
                  'https://www.thepaper.cn/')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            read_detial(a, i)
Example 7
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(file_path=path + "/data.csv",
                            mode='w+',
                            str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间'])
    # Crawler
    start_url = "https://military.china.com/news/"

    #print(start_url)
    list_html = htmlSource.get_html(url_p=start_url, type_p='rg')
    #print(list_html)
    colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href',
              'l')]
    result = rule.html_content_analysis_detial(html_text=list_html,
                                               column=colum,
                                               url=start_url)
    #print(result)
    for a in result[0][1]:
        read_detial(a)
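Note that this example crawls a single landing page rather than paginating, which is why read_detial is called without a page index and the header row written above has no page-number column.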
Example 8
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://military.people.com.cn/GB/1077/index%d.html"
    for i in range(1, 8):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href',
                  'sab', 'http://military.people.com.cn')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            read_detial(a, i)
Example 9
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://www.dsti.net/Information/HyeList/arms/%d"
    for i in range(1, 815):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href',
                  'sab', 'http://www.dsti.net')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            read_detial(a, i)
Example 10
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')]
        result = rule.html_content_analysis_detial(html_text=list_html,
                                                   column=colum,
                                                   url=url)
        #print(result)
        for a in result[0][1]:
            # Keep only article pages; the roll page also links other URL types
            if a.endswith('.shtml'):
                read_detial(a, i)
Example 11
# -*- coding: UTF-8 -*-

from common.inc_csv import Csv_base
from common.inc_file import File_floder
import requests
if __name__ == '__main__':
    # "百科候选关键词" = encyclopedia candidate keywords
    file_path = '../data/百科候选关键词.csv'
    folder_path = "../data/百科候选关键词/img"
    floder = File_floder()
    floder.add(folder_path)

    file = Csv_base()
    rows = file.read_csv_file(file_path)
    for row in rows:
        try:
            img_url = str(row[4]).replace('`', '')
            if img_url != '':
                # Derive a file name from the URL, dropping any query string
                img_name = img_url.split("/")[-1]
                if "?" in img_name:
                    img_name = img_name.split("?", 1)[0]
                #img_content = requests.get(img_url).content
                #with open('../data/百科候选关键词/img/%s.jpg' % img_name, 'wb') as f:
                #    f.write(img_content)
                rw_str = [img_url, img_name, row[11].replace('`', '')]
                file.write_csv_file_line(file_path="../data/百科候选关键词_img.csv", str=rw_str)
        except Exception as e:
            print(e)
        except Exception as e:
            print(e)
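The actual image download is commented out in the loop above. If re-enabling it, a minimal sketch with basic error handling might look like this; the download_image name, the timeout, and the status check are additions for illustration, not part of the original code.

import requests

def download_image(img_url, img_name, folder='../data/百科候选关键词/img'):
    # Hypothetical helper: fetch the image with a timeout and only
    # write it to disk on HTTP 200, mirroring the commented-out block.
    resp = requests.get(img_url, timeout=10)
    if resp.status_code == 200:
        with open('%s/%s.jpg' % (folder, img_name), 'wb') as f:
            f.write(resp.content)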