def parse_juchao(response, item):
    """Parse the cninfo (巨潮资讯) SZSE announcement JSON feed.

    Extracts announcement metadata via jsonpath, downloads each new
    announcement PDF, converts it to text and yields the caller-supplied
    ``item`` mapping populated with the result.

    :param response: HTTP response whose ``.text`` body is the JSON payload.
    :param item: mutable mapping filled in and yielded once per
        previously-unseen announcement (NOTE: the same object is reused
        across yields — downstream must copy if it buffers items).
    """
    PUB_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse/bulletin_detail/true/'
    D_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse/download/'
    # BUG FIX: json.loads() lost its ``encoding`` kwarg in Python 3.9;
    # response.text is already a decoded str, so no encoding is needed.
    json_str = json.loads(response.text)
    urls = jsonpath.jsonpath(json_str, "$..announcementId")
    title1 = jsonpath.jsonpath(json_str, "$..secCode")
    title2 = jsonpath.jsonpath(json_str, "$..secName")
    title3 = jsonpath.jsonpath(json_str, "$..announcementTitle")
    timestamp = jsonpath.jsonpath(json_str, "$..announcementTime")
    pdf_paths = jsonpath.jsonpath(json_str, "$..adjunctUrl")
    # BUG FIX: jsonpath.jsonpath() returns False (not None) when nothing
    # matches, so test truthiness instead of ``is None``.
    if not title2:
        titles = zip(title1, title3)
    else:
        titles = zip(title1, title2, title3)
    for url, title, time_local, pdf in zip(urls, titles, timestamp, pdf_paths):
        # BUG FIX: the original indexed title[2] (IndexError on the 2-tuple
        # path) and then space-joined the resulting *string* character by
        # character.  Join the non-None parts of the tuple instead.
        title = ' '.join(part for part in title if part is not None)
        # Strip characters that are illegal in file names.
        title = title.replace('*', '').replace('/', '').replace('<', '').replace('>', '') \
            .replace('|', '').replace(':', '').replace('"', '').replace('?', '') \
            .replace('?', '')
        durl = D_URL + url  # PDF file download address
        # assumes attachment paths always end in upper-case '.PDF' — TODO confirm
        if pdf[-4:] != '.PDF':
            continue
        # Skip announcements already stored for this site (web_id=56).
        if session.query(NewsItemInfo).filter_by(url=PUB_URL + url, web_id=56).count():
            continue
        contents = pdf_to_txt.main(url=durl, fileName=title)
        if len(contents) == 0:
            item['content'] = '请点击原文链接查看'
        else:
            item['content'] = '\n'.join(list(contents))
        # announcementTime looks like a millisecond epoch; drop the last
        # three digits to get seconds before converting.
        times = str(time_local)[0:-3] + '.' + '000'
        item['pub_time'] = datetime.datetime.fromtimestamp(
            float(times)).strftime('%Y-%m-%d %H:%M:%S')
        item['webname'] = '巨潮资讯'
        item['web'] = response.url[0:-7]
        item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item["keyword"] = keyword.get_keyword(item["content"])
        item['web_id'] = 56
        item['title'] = title
        item['url'] = PUB_URL + url
        yield item
def parse(self, response):
    """Parse the Bond Connect (债券通) download listing page.

    Yields a ``FagaiweiItem`` for every linked PDF not yet stored in the
    database under web_id 78.

    :param response: listing-page response (Scrapy-style, with ``.xpath``).
    """
    titles = response.xpath(
        "//div[contains(@class,'item-download-v2')]/span[@class='txt-blue']/text()"
    ).getall()
    urls = response.xpath("//div[@class='flt-l-wrapper-v2']//a/@href").getall()
    dates = response.xpath("//div[@class='date']/span/text()").getall()
    # NOTE: loop variable renamed from ``time`` — it shadowed the time
    # module (which broke the commented-out add_time line).
    for url, title, pub_date in zip(urls, titles, dates):
        # BUG FIX: the original had a no-op ``if 'http' in url: url = url``
        # branch; only relative links need resolving against the site root.
        if 'http' not in url:
            url = 'http://www.chinabondconnect.com/' + url.replace('../', '')
        if url[-4:] != '.pdf':
            continue
        # Skip documents already persisted for this site (web_id=78).
        if session.query(NewsItemInfo).filter_by(url=url, web_id=78).count():
            continue
        item = FagaiweiItem()
        item['webname'] = '债券通'
        item['web'] = response.url
        # Strip characters that are illegal in file names.
        title = ''.join(list(title)).replace('*', '').replace('/', '').replace('<', '').replace('>', '') \
            .replace('|', '').replace(':', '').replace('"', '').replace('?', '') \
            .replace('?', '')
        # Dates come as DD.MM.YYYY; normalise separators for strptime.
        item['pub_time'] = datetime.strptime(pub_date.replace('.', '-'), '%d-%m-%Y')
        item['url'] = url
        item['title'] = title
        content = pdf.main(url=url, fileName=title)
        if content == '':
            item['content'] = '请点击原文链接查看' + response.url
        else:
            item['content'] = ''.join(list(content))
        item["keyword"] = keyword.get_keyword(item["content"])
        item['web_id'] = 78
        yield item
def parse(self, response):
    """Yield a FagaiweiItem for each new PDF disclosure on the SSE (上海证券交易所) list page."""
    titles = response.xpath("//dd/em/a/text()").getall()
    urls = response.xpath("//dd/a/@href").getall()
    dates = response.xpath("//dd/span/text()").getall()
    for link, raw_title, raw_date in zip(urls, titles, dates):
        if link[-4:] != '.pdf':
            continue
        already_stored = session.query(NewsItemInfo).filter_by(
            url=link, web_id=55).count()
        if already_stored:
            # Seen before — nothing to do for this row.
            continue
        item = FagaiweiItem()
        item['webname'] = '上海证券交易所'
        item['web'] = response.url
        # Remove characters that are unsafe in file names / titles.
        cleaned = ''.join(list(raw_title))
        for ch in ('*', '/', '<', '>', '|', ':', '"', '?', '?', '\t'):
            cleaned = cleaned.replace(ch, '')
        # The date cell carries heavy CRLF padding; strip it down.
        item['pub_time'] = raw_date.replace('\r\n\r\n\r\n', '').strip()
        item['url'] = link
        item['title'] = cleaned
        extracted = pdf.main(url=link, fileName=cleaned)
        if len(extracted) == 0:
            item['content'] = '请点击原文链接查看' + response.url
        else:
            item['content'] = ''.join(list(extracted))
        item["keyword"] = keyword.get_keyword(item["content"])
        item['web_id'] = 55
        yield item
def parse(self, response):
    """Parse the cnstock (中国证券网) disclosure list and yield items for new PDFs.

    BUG FIX: the original created one ``FagaiweiItem`` before the loop and
    mutated/yielded that same instance for every row, so later rows
    overwrote items still sitting in the pipeline.  The item is now
    created fresh per row.

    :param response: listing-page response (Scrapy-style, with ``.xpath``).
    """
    urls = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='tit']/a/@href").getall()
    titles1 = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='tit']/a/text()").getall()
    titles2 = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='code']/a/text()").getall()
    times = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='time']/text()").getall()
    for url, title1, title2, pub_time in zip(urls, titles1, titles2, times):
        title = title2 + ' ' + title1
        # Strip characters that are illegal in file names.
        title = ''.join(list(title)).replace('*', '').replace('/', '').replace('<', '').replace('>', '') \
            .replace('|', '').replace(':', '').replace('"', '').replace('?', '') \
            .replace('?', '')
        # The link embeds an 8-digit YYYYMMDD date plus the file name.
        shijian, filename = re.findall(r'=(\d{8})(\w+)', url)[0]
        # BUG FIX: the year directory was hard-coded to '2018'; derive it
        # from the YYYYMMDD date in the link instead.
        durl = ('http://php.cnstock.com/texts/' + shijian[:4] + '/' +
                shijian + '/' + filename + '.pdf')  # PDF download address
        if durl[-4:] != '.pdf':
            continue
        # Skip announcements already stored for this site (web_id=67).
        if session.query(NewsItemInfo).filter_by(url=durl, web_id=67).count():
            continue
        item = FagaiweiItem()
        content = pdf.main(url=durl, fileName=title)
        if len(content) == 0:
            item['content'] = '请点击原文链接查看'
        else:
            item['content'] = ''.join(list(content))
        item['web_id'] = 67
        item['title'] = title
        item['pub_time'] = pub_time.replace('(', '').replace(')', '')
        item['webname'] = '中国证券网信息披露平台'
        item['web'] = response.url
        item['url'] = durl
        item["keyword"] = keyword.get_keyword(item["content"])
        yield item
def parse(self, response):
    """Yield a FagaiweiItem for each new PDF notice on the CES (中华交易服务) page."""
    titles = response.xpath(
        "//div[contains(@class,'title')]/a/text()").getall()
    urls = response.xpath(
        "//div[contains(@class,'items-col')]/a/@href").getall()
    dates = response.xpath(
        "//div[@class='items']/div[contains(@class,'date')]/text()").getall()
    for link, raw_title, raw_date in zip(urls, titles, dates):
        if link[-4:] != '.pdf':
            continue
        already_stored = session.query(NewsItemInfo).filter_by(
            url=link, web_id=79).count()
        if already_stored:
            # Seen before — skip this row.
            continue
        item = FagaiweiItem()
        item['webname'] = '中华交易服务'
        item['web'] = response.url
        # Remove characters that are unsafe in file names / titles.
        cleaned = ''.join(list(raw_title))
        for ch in ('*', '/', '<', '>', '|', ':', '"', '?', '?', '\t'):
            cleaned = cleaned.replace(ch, '')
        item['pub_time'] = datetime.strptime(
            raw_date.replace('/', '-'), '%d-%m-%Y')
        item['url'] = link
        item['title'] = cleaned
        text = pdf.main(url=link, fileName=cleaned)
        if text == '':
            item['content'] = '请点击原文链接查看' + response.url
        else:
            item['content'] = ''.join(list(text))
        item["keyword"] = keyword.get_keyword(item["content"])
        item['web_id'] = 79
        yield item
def parse(self, response):
    """Yield a FagaiweiItem for each new PDF filing listed on hkexnews (披漏网)."""
    rows = response.xpath(
        '//body/table[2]/tr[3]/td/table/tr[contains(@class,"row")]')
    for row in rows:
        link = 'http://www.hkexnews.hk' + row.xpath(
            './td[4]/a/@href').extract_first(default='')
        if link[-4:] != '.pdf':
            continue
        already_stored = session.query(NewsItemInfo).filter_by(
            url=link, web_id=80).count()
        if already_stored:
            # Seen before — skip this row.
            continue
        item = FagaiweiItem()
        item['url'] = link
        raw_title = row.xpath('./td[3]/nobr/text()').extract_first() + \
            ':' + row.xpath('./td[4]/div/text()').extract_first(
                default='').strip()
        # Remove characters that are unsafe in file names / titles.
        cleaned = ''.join(list(raw_title))
        for ch in ('*', '/', '<', '>', '|', ':', '"', '?', '?'):
            cleaned = cleaned.replace(ch, '')
        item['title'] = cleaned
        item['web'] = response.url
        item['webname'] = '披漏网'
        stamp = ' '.join(
            row.xpath('./td[1]/text()').extract()).replace('/', '-')
        item['pub_time'] = datetime.strptime(stamp, '%d-%m-%Y %H:%M')
        text = pdf.main(url=link, fileName=cleaned)
        if len(text) == 0:
            item['content'] = '这可能是图片或者文件,打开查看!'
        else:
            item['content'] = ''.join(list(text))
        item['web_id'] = 80
        item["keyword"] = keyword.get_keyword(item["content"])
        yield item