Code Example #1
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [(
            'a',
            '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href',
            'sab', '')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
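Note: every main() / read_detial excerpt in this listing relies on module-level names (path, htmlSource, rule, and the helper classes) that are defined at the top of the original scripts but not shown here. The sketch below is a guess at that shared setup: only the Csv_base and File_floder imports are confirmed by the listing (see Code Example #14); the other names are hypothetical stand-ins inferred from the call sites.

import time

from common.inc_csv import Csv_base
from common.inc_file import File_floder, File_file  # File_file location is assumed

# Hypothetical placeholders for the fetch/parse helpers used by the excerpts;
# only their call signatures are taken from the code in this listing:
#   htmlSource.get_html(url_p=..., type_p='rg') -> page HTML as a string
#   rule.html_content_analysis_detial(html_text=..., column=..., url=...)
#       -> list of (name, values) pairs, one entry per rule in `column`
# htmlSource = HtmlSource()
# rule = Rule()

path = "../data/news"  # output folder for the saved HTML pages and data.csv (example value)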
Code Example #2
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://mil.huanqiu.com/world/%s.html"
    for i in range(1, 101):
        # Page 1 uses 'index'; later pages use the page number
        s = 'index' if i == 1 else i
        url = start_url % (s)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Code Example #3
def read_detial(url, i):
    detial_html = htmlSource.get_html(url_p=url, type_p='rg')
    #print(detial_html)
    # Save the raw HTML page
    files = File_file()
    # Use the last URL segment as the saved file name
    file_name = url.split('/')[-1]

    files.save_source(path=path,
                      file=file_name,
                      all_the_text=detial_html,
                      encoding_='utf-8')
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate',
         '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
         'l'), ('content', '//div[@class="articleText"]//text()', 'sarra', ',')
    ]
    result = rule.html_content_analysis_detial(html_text=detial_html,
                                               column=colum,
                                               url=url)
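    # result mirrors the colum rules: result[0][1] holds the extracted title,
    # result[1][1] the publish date, and result[2][1] the article text.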
    print(result)
    #sql="insert into cancer value('%s','%s','%s','%s','%s')"%(result[0][1][0],str(result[1][1][0]).replace('患者,图片因隐私问题无法显示','').replace("患者,","患者:").replace("医生,","医生:").replace('\'','"'),type,'春雨医生',url)
    #print(sql)
    # Write one CSV row:
    # web_name, web_url, title, text (article body), publish_date, page, crawl time
    csv = Csv_base()
    csv.write_csv_file_line(file_path=path + "/data.csv",
                            str=[
                                '参考消息', url, result[0][1], result[1][1],
                                result[2][1], i,
                                time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))
                            ])
Code Example #4
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://www.81.cn/rd/node_92585%s.htm"
    for i in range(1, 6):
        # Page 1 has no suffix; later pages append "_<n>"
        ss = '' if i == 1 else '_%d' % i
        url = start_url % (ss)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab',
                  'http://www.81.cn/rd/')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Code Example #5
File: 5.1.py  Project: aiyang-t/crawler
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [(
            'a',
            '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href',
            'l')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Code Example #6
File: 3.2.py  Project: aiyang-t/crawler
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1"
    for i in range(1, 6):
        url = start_url % (i)

        read_detial(url, i)
Code Example #7
File: 7.1.py  Project: aiyang-t/crawler
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="news_li"]//h2//a//@href', 'sab',
                  'https://www.thepaper.cn/')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Code Example #8
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(file_path=path + "/data.csv",
                            mode='w+',
                            str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间'])
    # Crawler
    start_url = "https://military.china.com/news/"

    #print(url)
    list_html = htmlSource.get_html(url_p=start_url, type_p='rg')
    #print(list_html)
    colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href',
              'l')]
    list = rule.html_content_analysis_detial(html_text=list_html,
                                             column=colum,
                                             url=start_url)
    #print(list)
    for a in list[0][1]:
        read_detial(a)
Code Example #9
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://military.people.com.cn/GB/1077/index%d.html"
    for i in range(1, 8):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href',
                  'sab', 'http://military.people.com.cn')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Code Example #10
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://www.dsti.net/Information/HyeList/arms/%d"
    for i in range(1, 815):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href',
                  'sab', 'http://www.dsti.net')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Code Example #11
File: 1.2.py  Project: aiyang-t/crawler
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # Crawler
    start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            # Only follow .shtml article links
            if a.endswith('.shtml'):
                read_detial(a, i)
Code Example #12
from common.inc_csv import Csv_base
from lxml import html
import re


def readFile(filePath=''):
    pass


if __name__ == '__main__':
    filepath = '../data/问答语料_1.0.txt'
    file = Csv_base()
    list = file.read_csv_file(filepath)
    for i in range(len(list)):
        if (i > 3):
            row = list[i]
            rows = str(row[0]).split("\t")
            html_text = rows[1].replace("[", '').replace("]", "")
            # Strip <script> blocks with a regex (re.S lets '.' match newlines)
            html_text = re.sub(re.compile(r"<script.*?</script>", re.S), "",
                               html_text)

            print(html_text)
            tree = html.fromstring(html_text)
            texts = tree.xpath('.//text()')
            text = ""
            for a in texts:
                text = text + str(a).replace("\\n", ".").strip()
            row_content = [rows[0], text]
            file.write_csv_file_line(file_path="../data/问答语料_1.0.csv",
                                     str=row_content)
Code Example #13
if __name__ == '__main__':
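    # Assumes module-level helpers from the full script (not shown in this
    # excerpt): csv (a Csv_base instance), file (a reader exposing
    # open_source2), html (lxml.html), re, and a replaceStr() cleanup helper.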

    csv_data_path = "../../data/百科候选关键词.csv"
    rows = csv.read_csv_file(csv_data_path)
    for row in rows:
        try:
            html_data_path = str(row[11]).replace("`", "")
            #print("../"+html_data_path)
            html_context = file.open_source2(file_path="../" + html_data_path +
                                             ".html")

            # Strip <script> blocks with a regex (re.S lets '.' match newlines)
            html_text = re.sub(re.compile(r"<script.*?</script>", re.S), "",
                               html_context)
            tree = html.fromstring(html_text)
            texts = tree.xpath('.//div[@class="main-content"]//text()')
            text = ""
            for a in texts:
                text = text + str(a).replace("\\n", " ").strip()
            text = replaceStr(text)
            #print(text)

            data_str = [html_data_path, text]
            csv.write_csv_file_line(file_path="../data/clean百科候选关键词.csv",
                                    str=data_str)
        except FileNotFoundError as notfile:
            csv.write_csv_file_line("../data/nofile.csv", str=row)
        except Exception as e:
            print(e)
Code Example #14
# -*- coding: UTF-8 -*-

from common.inc_csv import Csv_base
from common.inc_file import File_floder
import requests
if __name__ == '__main__':
    file_path = '../data/百科候选关键词.csv'
    folder_path = "../data/百科候选关键词/img"
    floder = File_floder()
    floder.add(folder_path)

    file = Csv_base()
    list = file.read_csv_file(file_path)
    for row in list:
        try:
            img_url = str(row[4]).replace('`', '')
            if (img_url != ''):
                img_name = img_url.split("/")[-1]
                if (img_name.find("?") > -1):
                    img_name = img_name[0:img_name.find("?")]
                #img_content = requests.get(img_url).content
                #with open('../data/百科候选关键词/img/%s.jpg' % img_name, 'wb') as f:
                #    f.write(img_content)
                rw_str = [img_url, img_name, row[11].replace('`', '')]
                file.write_csv_file_line(file_path="../data/百科候选关键词_img.csv",
                                         str=rw_str)
        except Exception as e:
            print(e)
        except Exception as e:
            print(e)