def saveConf(self):
    """Persist the current crawler configuration as JSON text under ./configs."""
    # TODO: upload to the server and record in the database so a monitoring
    # endpoint can query it; prompt for a config name plus execution-control
    # conditions, schedule settings, etc., and store those in the database.
    target_name = "testxuexi111.json"
    # Serialise the in-memory configuration and write it to disk.
    writer = File_file()
    writer.save_source(path="./configs", file=target_name, all_the_text=str(self.conf))
def read_detial(url, i):
    """Fetch one article page, archive its raw HTML, extract fields, and append a row to data.csv."""
    page_html = htmlSource.get_html(url_p=url, type_p='rg')
    # Archive the raw HTML, named after the last path segment of the URL.
    archiver = File_file()
    archiver.save_source(path=path, file=url.split('/')[-1],
                         all_the_text=page_html, encoding_='utf-8')
    # Extraction rules: (field, xpath, mode[, join separator]).
    extraction_rules = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate', '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()', 'l'),
        ('content', '//div[@class="articleText"]//text()', 'sarra', ','),
    ]
    parsed = rule.html_content_analysis_detial(html_text=page_html, column=extraction_rules, url=url)
    print(parsed)
    # Row layout: web_name, web_url, title, text, publish_date, sequence index, crawl timestamp.
    csv_writer = Csv_base()
    csv_writer.write_csv_file_line(
        file_path=path + "/data.csv",
        str=[
            '参考消息',
            url,
            parsed[0][1],
            parsed[1][1],
            parsed[2][1],
            i,
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
        ])
def crawlerDetail(self, confs, url=''):
    """Run the detail-page crawler for the given configs and append the result to a JSON-lines file."""
    extractor = Rule()
    detail = extractor.crawler_detail(confs=confs, url=url)
    print(detail)
    # Append one newline-terminated record per call to the shared output file.
    writer = File_file()
    writer.save_source(path='../data/', file='xuexi111Detail.json',
                       all_the_text=str(detail) + '\n')
def read_detial(url, i):
    """Fetch one detail page and archive the raw HTML as <i>.json under `path`."""
    page_html = htmlSource.get_html(url_p=url, type_p='rg')
    # The archive file is named after the sequence number of the URL.
    writer = File_file()
    writer.save_source(path=path, file="%d.json" % i,
                       all_the_text=page_html, encoding_='utf-8')
#!/usr/bin/env python # coding=utf-8 from common.inc_csv import Csv_base from common.inc_file import File_file from lxml import html import re csv = Csv_base() file = File_file() def replaceStr(a): print(a) a = re.sub(re.compile(r"收藏查看我的收藏(\d+)有用(.*?)(\d+)已投票(\d+)", re.S), "", a) a = str(a).replace("编辑锁定", " ").strip() a = str(a).replace("讨论999", " ").strip() a = str(a).replace("本词条缺少概述图,补充相关内容使词条更完整,还能快速升级,赶紧来编辑吧!", " ").strip() a = str(a).replace( "百度百科内容由网友共同编辑,如您发现自己的词条内容不准确或不完善,欢迎使用本人词条编辑服务(免费)参与修正。", " ").strip() a = str(a).replace("立即前往 >>", " ").strip() print(a) return a if __name__ == '__main__': csv_data_path = "../../data/百科候选关键词.csv" rows = csv.read_csv_file(csv_data_path) for row in rows: try:
# -- coding: UTF-8 -- # 导入测试实例文件数据用, from common.inc_csv import Csv_base # DateUtils用日期计算工具 from datetime import datetime, timedelta, date,timezone from time import time, ctime, localtime, strftime, strptime, mktime import re,json,calendar # 寿星天文历 农历转公历 公历转农历 #import sxtwl from common.inc_file import File_file csv = Csv_base() file = File_file() class DateUtils: cn_num = { '〇': '0', '一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '零': '0', '壹': '1', '贰': '2', '叁': '3', '肆': '4', '伍': '5', '陆': '6', '柒': '7', '捌': '8', '玖': '9', '貮': '2', '两': '2', '俩': '2', '十': '', } cn_date_day = { "大前天": -3, "大后天": 3, "明天": 1, "后天": 2, "昨天": -1, "今天": 0, "前天": -2,
# -- coding: UTF-8 -- import requests from lxml import etree from common.inc_file import File_file file = File_file() urlx = 'https://jieqi.supfree.net/cntv.asp?n=' session = requests.Session() for year in range(833, 5001): dicall = {} url = urlx + str(year) resp = session.get(url=url) resp.encoding = 'gb2312' rep = etree.HTML(resp.text) a = rep.xpath('//table/tr/td/a/text()') b = rep.xpath('//table/tr/td/text()') dicall[str(year)] = {} for i in range(len(a)): if len(b) > i: dicall[str(year)][a[i]] = str(b[i]).strip()[:-9] file.save_source(path="./", file="jieqi.json", all_the_text=str(dicall) + "\n")