def download(reponse):
    """Extract every alert entry from a fetched page and store the new ones.

    Args:
        reponse: Response-like object; only its ``text`` attribute (the page
            HTML) is read.

    Returns:
        True as soon as ``title_find`` returns a truthy value for an entry
        (processing stops there); otherwise None once every matched entry
        has been handed to ``storage``.

    Raises:
        IndexError: If a matched entry lacks one of the nodes selected by the
            XPath expressions below.
    """
    # Raw string: "\s" / "\S" are invalid escapes in a plain string literal
    # and trigger a SyntaxWarning on recent Python versions.
    entry_pattern = re.compile(
        r'<div class="control clear data-bottom-id">[\s\S]*?<!----></div>'
    )
    for fragment in entry_pattern.findall(reponse.text):
        # A closing tag is appended before parsing -- presumably to balance
        # the fragment's outer <div>, which the lazy match cuts off.
        entry = etree.HTML(fragment + "</div>")

        # Collapse whitespace runs to single spaces; the trailing space is
        # kept so the title stays identical to what was looked up before.
        title_words = entry.xpath('//div[@class="content"]/a[1]/text()')[0].split()
        title = "".join(word + " " for word in title_words)
        if title_find(title, come_from="zaoping_alerts"):
            return True
        print("zaoping_alerts")

        body_words = entry.xpath('//div[@class="content"]/a[2]/text()')[0].split()
        text = "".join(word + " " for word in body_words)

        timeout = entry.xpath('//div[@class="time"]/text()')[0].split()
        if len(timeout) == 1:
            # NOTE(review): UTCTime receives the one-element list, not the
            # string inside it -- confirm that is what the helper expects.
            date = UTCTime(timeout)
        else:
            date = "".join(part + " " for part in timeout)
        storage(title, date, text)
def divide(i):
    """Extract one alert from an HTML fragment and store it if it is new.

    Args:
        i: Markup for a single alert, in a form accepted by ``etree.HTML``.

    Returns:
        None in every case. The function returns early when ``title_find``
        returns a truthy value for the extracted title. Any exception raised
        while parsing or storing is passed to ``mistake`` instead of
        propagating.
    """
    try:
        print("tuoniaox_alerts")
        # Parse the partial HTML into an element tree.
        html = etree.HTML(i)
        # Publication time shown with the alert.
        timeout = html.xpath('//span/text()')[0]
        # Alert body with whitespace runs collapsed; the trailing space is
        # kept so stored text keeps the same shape as before.
        words = html.xpath('//p/text()')[0].split()
        text = "".join(word + " " for word in words)

        # The bracketed headline doubles as the identifier because the page
        # exposes no explicit id value.
        title = re.findall(r"【[\s\S]*?】", text)[0]
        if title_find(title, come_from="tuoniaox_alerts"):
            return

        # First "<month>月<day>日" date mentioned in the body, if any.
        # {1,2} so that two-digit months and days are captured in full.
        dates = re.findall(r"\d{1,2}月\d{1,2}日", text)
        accurate = dates[0] if dates else ""

        author = "鸵鸟区块链:https://www.tuoniaox.com/"
        # A link in the fragment marks the alert as compiled from another
        # source rather than original to the site.
        links = html.xpath('//a/@href')
        if links:
            # xpath() returns a list; use the first href (concatenating the
            # list itself to a str raises TypeError).
            source = "负责编译--原文:" + links[0]
        else:
            source = "鸵鸟区块链--快讯"
        storage(title, author, timeout, accurate, source, text)
    except Exception as err:
        mistake(url="https://www.tuoniaox.com/", err=err)
def download(object):
    """Split one alert element into time, title and body, then store it.

    Args:
        object: Element serialised with ``etree.tostring(..., method="text")``.
            (The name shadows the builtin but is kept for caller
            compatibility.)

    Returns:
        True when ``title_find`` returns a truthy value for the title (nothing
        is stored); otherwise None after ``storage`` has been called.

    Raises:
        IndexError: If the element's text is empty or contains no
            "【title】body。" match.
    """
    tokens = (
        etree.tostring(object, method="text", encoding="utf8")
        .decode("utf8")
        .split()
    )
    timeout = tokens[0]
    # The first token is the time; the last three tokens are dropped.
    joined = "".join(token + " " for token in tokens[1:-3])

    # Raw string: "\s" / "\S" are invalid escapes in a plain string literal.
    # Group 1 is the bracketed title, group 2 everything up to the last "。".
    matches = re.findall(r'(【[\s\S]*?】)([\s\S]*。)', joined)
    title, body = matches[0]
    if title_find(title, come_from="btc798_alerts"):
        return True
    storage(title, timeout, body)
def download(text, reload):
    """Store one alert that has already been split into tokens.

    Args:
        text: Sequence of tokens: title first, body in the middle, and the
            last two tokens forming the timestamp.
        reload: Counter compared against 3 when the title is already known.

    Returns:
        True when ``title_find`` returns a truthy value and ``reload`` equals
        3; None in every other case (including after a successful store).
    """
    # Split the already-tokenised data further into title / body / time.
    headline = text[0]
    body_start = 1
    if headline == "马云:不支持比特币":
        # This particular headline spans two tokens.
        headline = headline + " " + text[1]
        body_start = 2
    main = text[body_start:-2]

    if title_find(headline, come_from="gongxiangcj_alerts"):
        # NOTE(review): `reload` is only compared here; a running count of
        # repeats has to be maintained by the caller.
        return True if reload == 3 else None

    timeout = text[-2] + " " + text[-1]
    storage(headline, timeout, main)
def download(reponse, url):
    """Parse the alert list of a fetched page and store each new alert.

    Args:
        reponse: Response-like object; only its ``text`` attribute (the page
            HTML) is read.
        url: Passed to ``mistake`` together with the exception when anything
            in the parsing/storing path fails.

    Returns:
        True as soon as ``title_find`` returns a truthy value for an alert
        (processing stops there); otherwise None. Exceptions are handed to
        ``mistake`` rather than propagated.
    """
    try:
        print("bibaodao_alerts")
        html = etree.HTML(reponse.text)
        nodes = html.xpath(
            '/html/body/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div')
        # Compiled once outside the loop. Raw string: "\s" / "\S" are invalid
        # escapes in a plain string literal.
        title_pattern = re.compile(r"【[\s\S]*?】")
        for node in nodes:
            tokens = (
                etree.tostring(node, method="text", encoding="utf8")
                .decode("utf8")
                .split()
            )
            # The first two tokens form the timestamp; the rest is the body.
            timeout = tokens[0] + " " + tokens[1]
            main = "".join(token + " " for token in tokens[2:])
            title = title_pattern.findall(main)[0]
            if title_find(title, come_from="bibaodao_alerts"):
                return True
            storage(title, timeout, main)
    except Exception as err:
        mistake(url, err)
def download(reponse):
    """Walk the entries under ``#panel1`` and store each until a known title.

    Args:
        reponse: Response-like object; only its ``text`` attribute (the page
            HTML) is read.

    Returns:
        None. The loop stops at the first entry for which ``title_find``
        returns a truthy value.
    """
    print("ihuoqiu_alerts")
    tree = etree.HTML(reponse.text)
    # asctime() without an argument formats the current local time.
    stamp_parts = time.asctime().split()
    for node in tree.xpath('//*[@id="panel1"]/div'):
        fields = (
            etree.tostring(node, method="text", encoding="utf8")
            .decode("utf8")
            .split()
        )
        # Adjust the timestamp: overwrite the clock field of today's date
        # with the entry's first token.
        stamp_parts[-2] = fields[0]
        timeout = "".join(part + " " for part in stamp_parts)

        title = fields[1]
        if title_find(title, come_from="ihuoqiu_alerts"):
            break

        # Assemble the body: drop the trailing token, plus the
        # "[查看原文]" marker when it precedes that token.
        body_end = -2 if fields[-2] == "[查看原文]" else -1
        storage(title, timeout, fields[2:body_end])