Esempio n. 1
0
def read_detial(url, i):
    """Fetch one article page, archive its raw HTML, parse the fields of
    interest and append them as a row to data.csv.

    url -- absolute URL of the detail page
    i   -- sequence number recorded in the CSV row
    """
    page_html = htmlSource.get_html(url_p=url, type_p='rg')

    # Archive the raw HTML under the last path segment of the URL.
    writer = File_file()
    page_name = url.split('/')[-1]
    writer.save_source(path=path,
                       file=page_name,
                       all_the_text=page_html,
                       encoding_='utf-8')

    # (field name, XPath expression, extraction mode[, join separator])
    columns = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate',
         '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
         'l'),
        ('content', '//div[@class="articleText"]//text()', 'sarra', ','),
    ]
    parsed = rule.html_content_analysis_detial(html_text=page_html,
                                               column=columns,
                                               url=url)
    print(parsed)

    # Row layout: site name, URL, title, publish date, content, index, timestamp.
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    csv_writer = Csv_base()
    csv_writer.write_csv_file_line(file_path=path + "/data.csv",
                                   str=[
                                       '参考消息', url, parsed[0][1],
                                       parsed[1][1], parsed[2][1], i, stamp
                                   ])
Esempio n. 2
0
 def saveConf(self):
     """Persist the current configuration (self.conf) as text to ./configs.

     TODO: upload to the server and record metadata (config name, run
     conditions, schedule) in a database to provide a monitoring/query point.
     """
     conf_name = "testxuexi111.json"
     writer = File_file()
     writer.save_source(path="./configs", file=conf_name, all_the_text=str(self.conf))
Esempio n. 3
0
 def crawlerDetail(self, confs, url=''):
     """Crawl one detail page according to *confs* and append the parsed
     result as a single line to ../data/xuexi111Detail.json."""
     rule_engine = Rule()
     parsed = rule_engine.crawler_detail(confs=confs, url=url)
     writer = File_file()
     # Echo the parsed record, then append it to the output file.
     print(parsed)
     writer.save_source(path='../data/',
                        file='xuexi111Detail.json',
                        all_the_text=str(parsed) + '\n')
Esempio n. 4
0
def read_detial(url, i):
    """Fetch one detail page and archive its raw HTML under <i>.json.

    url -- absolute URL of the detail page
    i   -- sequence number used as the archive file name
    """
    page_html = htmlSource.get_html(url_p=url, type_p='rg')

    writer = File_file()
    writer.save_source(path=path,
                       file="%d.json" % i,
                       all_the_text=page_html,
                       encoding_='utf-8')
Esempio n. 5
0
# -- coding: UTF-8 --

import requests
from lxml import etree
from common.inc_file import File_file

# Scrape the solar-term table from jieqi.supfree.net for every year page
# (833..5000) and append each year's name->date mapping to jieqi.json,
# one dict-repr per line.
file = File_file()
urlx = 'https://jieqi.supfree.net/cntv.asp?n='
session = requests.Session()

for year in range(833, 5001):
    resp = session.get(url=urlx + str(year))
    resp.encoding = 'gb2312'  # pages are served in GB2312
    tree = etree.HTML(resp.text)
    term_names = tree.xpath('//table/tr/td/a/text()')
    term_dates = tree.xpath('//table/tr/td/text()')
    # Pair term names with their date cells; unmatched trailing cells
    # are ignored (zip stops at the shorter list, as the original
    # len(b) > i guard did). The [:-9] slice drops a fixed-width suffix.
    year_terms = {}
    for term, date_text in zip(term_names, term_dates):
        year_terms[term] = str(date_text).strip()[:-9]
    file.save_source(path="./",
                     file="jieqi.json",
                     all_the_text=str({str(year): year_terms}) + "\n")