def parse_item(self, response):
    if response.status == 200:
        try:
            data = json.loads('[' + response.body.decode() + ']')
            print(data)
        except Exception as e:
            print(e)
            item = ErrorItem()
            item['code'] = 901
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "httpbin"
            # description of the error
            item['desc'] = '响应的json数据错误'
            # exception raised by the code
            item['exception'] = str(e)
            # yield instead of return: this method is a generator, so a
            # returned item would be silently dropped by Scrapy
            yield item
            return
        else:
            try:
                for item in data:
                    i = NewsItem()
                    i['source'] = "httpbin"
                    # print(item)
                    i['pubDate'] = ""
                    i['title'] = ""
                    i['content'] = item['origin']
                    yield i
            except Exception as e:
                item = ErrorItem()
                item['code'] = 902
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "httpbin"
                # description of the error
                item['desc'] = '解析json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "httpbin"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

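# All of the parse callbacks in this section fill two Scrapy item classes that
# are defined elsewhere in the project. A minimal sketch of what they might
# look like, with field names inferred purely from the assignments in these
# snippets (a hypothetical reconstruction, not the project's actual items.py):
import scrapy


class NewsItem(scrapy.Item):
    source = scrapy.Field()
    pubDate = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    isRed = scrapy.Field()


class ErrorItem(scrapy.Item):
    code = scrapy.Field()        # HTTP status or project-specific error code
    url = scrapy.Field()         # URL of the failing page
    timestamp = scrapy.Field()   # time the error occurred
    site = scrapy.Field()        # human-readable site name
    desc = scrapy.Field()        # short description of what went wrong
    exception = scrapy.Field()   # str() of the raised exception
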
def parse(self, response):
    item = NewsItem()
    movieList = response.xpath(
        '//div[@class="col01L"]/div[@class="box_02"]/ul/li')
    for movie in movieList:
        link = movie.xpath('.//a/@href').extract_first()
        item["link"] = link
        yield Request(link, callback=self.parse2, meta=item)

def parse_item(self, response):
    if response.status == 200:
        try:
            data = json.loads(response.body.decode())
        except Exception as e:
            print(e)
            item = ErrorItem()
            item['code'] = 901
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "e公司"
            # description of the error
            item['desc'] = '响应的json数据错误'
            # exception raised by the code
            item['exception'] = str(e)
            # yield instead of return: this method is a generator
            yield item
            return
        else:
            try:
                for item in data['data']:
                    i = NewsItem()
                    i['source'] = "egs"
                    # print(item)
                    i['pubDate'] = item.get('pageTime', "")
                    i['title'] = item.get('title', "")
                    i['content'] = item.get('content', "")
                    i['isRed'] = item.get('isRed', 0)
                    yield i
            except Exception as e:
                item = ErrorItem()
                item['code'] = 902
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "e公司"
                # description of the error
                item['desc'] = '解析json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "e公司"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

def parse_comment(self, response):
    result = json.loads(response.text)
    item = NewsItem()
    item['source'] = response.meta['source']
    item['date'] = response.meta['date']
    item['newsId'] = response.meta['newsId']
    item['url'] = response.meta['url']
    item['title'] = response.meta['title']
    item['contents'] = response.meta['contents']
    item['comments'] = result['cmtAgainst'] + result['cmtVote'] + result['rcount']
    item['time'] = response.meta['time']
    return item

def parse_content(self, response):
    try:
        soup = BeautifulSoup(response.body)
        date = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(
                time.mktime(
                    time.strptime(response.meta['publish_time'],
                                  "%Y-%m-%d %H:%M:%S"))))
        # stop condition: drop news older than the configured window
        interval = tools.time_cmp(float(self.scan_id), date)
        if interval > self.days:
            print('______________过时新闻________________'.encode(
                "utf-8").decode(self.decoding))
            return
        title = soup.find('div', attrs={
            'class': 'LEFT'
        }).find('h1').get_text()
        hot_degree = int(response.meta['comment_num'])
        keywords = ' '.join(response.meta['keywords'])
        # remove the leading div node
        soup.find('div', attrs={
            'class': 'content-article'
        }).find('div').decompose()
        article = []
        for p in soup.find('div', attrs={
                'class': 'content-article'
        }).find_all('p'):
            if p.get_text() is not None:
                article.append(p.get_text().strip())
        article = '\n'.join(article)
        abstract = tools.tencent_keyword_abstract(article, 4)
        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        print('腾讯网: '.encode("utf-8").decode(self.decoding),
              title.encode("utf-8").decode(self.decoding).strip())
        if max(similar_list) > self.threshold:
            item = NewsItem()
            item['title'] = title.strip()
            item['url'] = response.url.strip()
            item['net_name'] = '腾讯'
            item['ent_time'] = date
            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = str(
                tools.divide_hot_degree(self.name, hot_degree))
            item['scan_id'] = str(self.scan_id)
            return item
    except:
        pass

def parse_page(self, response):
    if response.status == 200:
        lis = response.xpath('/html/body/div/div/ul/li')
        if lis is None or len(lis) == 0:
            item = ErrorItem()
            item['code'] = 801
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            item['date'] = time.time()
            # site where the error occurred
            item['site'] = "中证网"
            # description of the error
            item['desc'] = '未找到html元素'
            # exception raised by the code
            item['exception'] = ''
            # yield instead of return: this method is a generator
            yield item
            return
        try:
            for li in lis:
                item = NewsItem()
                item['source'] = "cs"
                # temp = li.xpath('./span/text()').get().strip()  # 19-05-16 18:43
                # temp = '20' + temp
                # d = datetime.datetime.strptime(temp, "%Y-%m-%d %H:%M")
                # t = d.timetuple()
                # timeStamp = int(time.mktime(t))
                #
                # item['pubDate'] = timeStamp
                item['pubDate'] = ''
                item['title'] = li.xpath('./a/text()').get()
                url = r'http://www.cs.com.cn/sylm/jsbd/' + li.xpath('./a/@href').get()
                yield scrapy.Request(url=url, meta={'item': item},
                                     callback=self.parse_item, dont_filter=True)
        except Exception as e:
            item = ErrorItem()
            item['code'] = 802
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "中证网"
            # description of the error
            item['desc'] = '解析html元素错误'
            # exception raised by the code
            item['exception'] = str(e)
            yield item

def parse_comment(self, response):
    if re.findall(r'"total":(\d*)\,', response.text):
        comments = re.findall(r'"total":(\d*)\,', response.text)[0]
    else:
        comments = 0
    item = NewsItem()
    item['source'] = response.meta['source']
    item['time'] = response.meta['time']
    item['date'] = response.meta['date']
    item['contents'] = response.meta['contents']
    item['title'] = response.meta['title']
    item['url'] = response.meta['url']
    item['newsId'] = response.meta['newsId']
    item['comments'] = comments
    return item

def parse_news(self, response):
    sel = Selector(response)
    if sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'):
        title = sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract()[0]
    elif sel.xpath('//*[@id="C-Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'):
        title = sel.xpath('//*[@id="C-Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract()[0]
    elif sel.xpath('//*[@id="ArticleTit"]/text()'):
        title = sel.xpath('//*[@id="ArticleTit"]/text()').extract()[0]
    else:
        title = 'unknown'
    pattern = re.match(self.url_pattern, str(response.url))
    source = 'tencent'
    date = pattern.group(2)
    date = date[0:4] + '/' + date[4:6] + '/' + date[6:]
    newsId = pattern.group(3)
    url = response.url
    if sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()'):
        time_ = sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()').extract()[0]
    else:
        time_ = 'unknown'
    contents = ListCombiner(sel.xpath('//p/text()').extract()[:-8])
    if response.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[2]/script[2]/text()'):
        cmt = response.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[2]/script[2]/text()').extract()[0]
        if re.findall(r'cmt_id = (\d*);', cmt):
            cmt_id = re.findall(r'cmt_id = (\d*);', cmt)[0]
            comment_url = 'http://coral.qq.com/article/{}/comment?commentid=0&reqnum=1&tag=&callback=mainComment&_=1389623278900'.format(cmt_id)
            yield Request(comment_url, self.parse_comment,
                          meta={'source': source,
                                'date': date,
                                'newsId': newsId,
                                'url': url,
                                'title': title,
                                'contents': contents,
                                'time': time_})
    else:
        item = NewsItem()
        item['source'] = source
        item['time'] = time_
        item['date'] = date
        item['contents'] = contents
        item['title'] = title
        item['url'] = url
        item['newsId'] = newsId
        item['comments'] = 0
        # yield instead of return: this method is a generator, so a returned
        # item would be silently dropped by Scrapy
        yield item

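# parse_news above relies on a ListCombiner helper that is not shown in this
# section. Judging only from how it is called (a list of <p> text nodes in, a
# single contents string out), it is presumably a plain concatenation. A
# hypothetical sketch under that assumption:
def ListCombiner(fragments):
    # join the extracted paragraph fragments into one string, trimming the
    # whitespace around each piece
    return ''.join(s.strip() for s in fragments)
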
def parse2(self, response):
    item2 = NewsItem()
    item = response.meta
    content = response.xpath('//div[@id="artical"]')
    # e.g. http://finance.ifeng.com/a/20180806/16429540_0.shtml
    if 'finance.ifeng.com/a' not in item['link'] or '.shtml' not in item['link']:
        return
    # time check: only news from roughly the last 20 minutes is kept
    publishTime = content[0].xpath(
        './/span[@itemprop="datePublished"]/text()').extract_first()
    if publishTime < ((datetime.datetime.now() -
                       datetime.timedelta(minutes=19)).strftime('%Y-%m-%d %H:%M:%S')):
        return
    item2["createTime"] = publishTime
    # title, artical, origin, link
    # source
    origin = content[0].xpath(
        './/span[@itemprop="publisher"]/span/a/text()').extract_first()
    if origin is None:
        origin = content[0].xpath(
            './/span[@itemprop="publisher"]/span/text()').extract_first()
    item2["source"] = origin
    # title
    title = content[0].xpath(
        './/h1[@itemprop="headline"]/text()').extract_first()
    item2["title"] = title
    # content
    artical = content[0].xpath('.//div[@id="artical_real"]').xpath(
        'string(.)').extract_first()
    item2["content"] = artical.replace('\r\n', '').replace(' ', '').replace('\n', '')[0:200]
    item2["link"] = item["link"]
    item2["uuid"] = uuid.uuid1()
    yield item2

def news_parser(news_config: dict, response: Response):
    title = _extract([XPathFirst(xpath_str) for xpath_str in news_config['title_xpath_list']], response)
    author = _extract([XPathFirst(xpath_str) for xpath_str in news_config['author_xpath_list']], response)
    publish_time = _extract([XPathFirst(xpath_str) for xpath_str in news_config['publish_time_xpath_list']], response)
    rich_content_origin = _extract([XPathFirst(xpath_str) for xpath_str in news_config['content_xpath_list']], response)
    if not (title and publish_time and rich_content_origin):
        logging.debug(f'[NOT NEWS]<title>:{title}, '
                      f'<publish_time>:{publish_time} '
                      f'<content_text>:{bool(rich_content_origin)}'
                      f'<url>:{response.url}')
        return False, None
    cleaned_content, cleaned_content_text = clean_html_content_text(rich_content_origin)
    logging.debug(f'[IS NEWS]<title>:{title}, '
                  f'<publish_time>:{publish_time} '
                  f'<content_text>:{cleaned_content_text}'
                  f'<url>:{response.url}')
    return True, NewsItem(
        source=news_config['source'],
        title=title,
        author=author,
        publish_time=publish_time,
        content=cleaned_content_text,
        rich_content=cleaned_content,
    )

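# news_parser is the only config-driven parser in this section; the _extract
# and XPathFirst helpers it uses are defined elsewhere. A plausible reading of
# the call sites is "try each configured XPath in order and return the first
# non-empty match". A hypothetical sketch under that assumption:
from typing import Iterable, Optional

from scrapy.http import Response


class XPathFirst:
    """Wrap an XPath expression and return its first match from a response."""

    def __init__(self, xpath_str: str):
        self.xpath_str = xpath_str

    def extract(self, response: Response) -> Optional[str]:
        return response.xpath(self.xpath_str).get()


def _extract(extractors: Iterable[XPathFirst], response: Response) -> Optional[str]:
    # return the first non-empty value produced by the configured extractors
    for extractor in extractors:
        value = extractor.extract(response)
        if value:
            return value
    return None
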
def parse_detail(self, response):
    newsitem = NewsItem()
    selector = Selector(response)
    current_url = response.url  # current page URL
    # print(current_url)
    url = response.meta['url']
    cate = response.meta['cate']
    source = response.meta['source']
    tag = response.meta['tag']
    tags = tag.split(':')
    xpath_rule = './/div[@' + tags[0] + '=' + "'" + tags[1] + "'" + ']/p'
    res = selector.xpath(xpath_rule)
    content = ''
    for c in res:
        content += c.xpath('string(.)').extract_first() + '\r\n'
    content = re.sub('[\u3000 \xa0 \\t \u200b ■]+', '', content)
    # filter special characters found on people.com.cn pages
    content = re.sub(r'showPlayer.*?;', '', content)
    content = '\r\n'.join([
        c.replace('\n', '') for c in content.split('\r\n')
        if c.strip() and len(c.strip()) > 20
    ])
    if content:
        newsitem['url'] = current_url
        newsitem['content'] = content
        newsitem['source'] = source
        newsitem['category'] = cate
        yield newsitem
    else:
        yield scrapy.Request(url=current_url,
                             meta={
                                 'url': url,
                                 'cate': cate,
                                 'source': source,
                                 'tag': tag
                             },
                             callback=self.parse)

def parse_item(self, response):
    if response.status == 200:
        lis = response.xpath('//ul[@class="nf-list"]/li')
        if lis is None or len(lis) == 0:
            item = ErrorItem()
            item['code'] = 801
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "上证快讯"
            # description of the error
            item['desc'] = '未找到html元素'
            # exception raised by the code
            item['exception'] = ''
            # yield instead of return: this method is a generator
            yield item
            return
        try:
            # date, e.g. 2019年05月16日
            # riqi = response.xpath('//div[@class="nf-head"]/p/text()').get().strip()
            for li in lis:
                item = NewsItem()
                item['source'] = "cnstock"
                # temp = li.xpath('./p[1]/text()').get()  # e.g. 20:30
                # temp = riqi + temp  # 2019年05月16日20:30
                #
                # d = datetime.datetime.strptime(temp, "%Y年%m月%d日%H:%M")
                # t = d.timetuple()
                # timeStamp = int(time.mktime(t))
                #
                # item['pubDate'] = timeStamp
                item['pubDate'] = ''
                title_conent = li.xpath('./p[2]/a/text()').get()
                # e.g.:
                '''
                【压垮乐视网的最后一根稻草竟然是它!】15日,进入暂停上市状态第三天的乐视网披露,因乐视体育经营不利导致增资协议中的对赌条款失败,乐视体育股东之一的前海思拓提出的涉及回购融资股权的仲裁申请,得到了北京仲裁委员会的支持。
                '''
                item['title'] = (re.findall('【.*】', title_conent)[0]).replace('【', '').replace('】', '')
                item['content'] = re.findall('】.*', title_conent)[0].replace('】', '')
                item['isRed'] = 0
                yield item
        except Exception as e:
            item = ErrorItem()
            item['code'] = 802
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "上证快讯"
            # description of the error
            item['desc'] = '解析html标签错误'
            # exception raised by the code
            item['exception'] = str(e)
            yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "上证快讯"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

def parse_item(self, response):
    if response.status == 200:
        try:
            data_list = json.loads(response.body.decode())
            # print(data_list)
        except Exception as e:
            print(e)
            item = ErrorItem()
            item['code'] = 800
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "第一财经"
            # description of the error
            item['desc'] = '响应的json数据错误'
            # exception raised by the code
            item['exception'] = str(e)
            yield item
        else:
            try:
                for data in data_list:
                    item = NewsItem()
                    item['source'] = 'yicai'
                    date = data['datekey'] + " " + data['hm']  # e.g. 2019.05.16 20:43
                    # print('<<<<<<<<<<< ' + temp + ' >>>>>>>>>>>')
                    d = datetime.datetime.strptime(date, "%Y.%m.%d %H:%M")
                    t = d.timetuple()
                    item['pubDate'] = int(time.mktime(t))
                    # print(item)
                    title_conent = data['newcontent']
                    # e.g.:【传化智联:非公开发行股票方案到期失效】 传化智联5月16日晚间公告,公司于2017年度股东大会审议通过《关于公司非公开发行股票方案的议案》,因资本市场环境变化等因素,公司此次非公开发行股票事项尚未取得实质进展。目前,此次非公开发行股票方案到期自动失效。
                    item['title'] = re.findall('【.*】', title_conent)[0].replace(
                        '【', '').replace('】', '')
                    item['content'] = re.findall('】.*', title_conent)[0].replace(
                        '】', '')
                    # TODO
                    item['isRed'] = 0
                    yield item
            except Exception as e:
                item = ErrorItem()
                item['code'] = 902
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "第一财经"
                # description of the error
                item['desc'] = '解析json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "第一财经"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

def parse_content(self, response):
    try:
        soup = BeautifulSoup(response.body)
        # get the publish time
        date = response.meta['date']
        # stop condition: drop news older than the configured window
        interval = tools.time_cmp(
            float(self.scan_id),
            time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(
                    time.mktime(
                        time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00")))))
        if interval > self.days:
            print('______________过时新闻________________'.encode(
                "utf-8").decode(self.decoding))
            return
        # get the title
        title = soup.find('div', attrs={
            'class': 'mobile_article'
        }).find('h1').get_text()
        # get the article body
        textblock = soup.find('section', attrs={'class': 'textblock'})
        try:
            if textblock.find('p').get_text().strip().startswith('编者'):
                textblock.find('p').find('p').decompose()
        except:
            pass
        article = []
        for p in textblock.find_all('p'):
            if p.get_text() is not None:
                article.append(p.get_text().strip())
        article = '\n'.join(article)
        # get the summary
        summary = soup.find('section', attrs={
            'class': 'summary'
        }).get_text().strip()
        # get the like count
        hot_degree = int(
            soup.find('b', attrs={
                'class': 'count-min'
            }).get_text().strip())
        # get keywords and abstract
        keywords, abstract = tools._36r_keyword_abstract(article, 3, 3)
        raw_keywords = []
        for item in soup.find_all('a', attrs={'class': 'kr-tag-gray'}):
            raw_keywords.append(item.get_text())
        if len(raw_keywords) != 0:
            keywords = raw_keywords
        keywords = ' '.join(keywords)
        print('36氪: '.encode("utf-8").decode(self.decoding),
              title.encode("utf-8").decode(self.decoding).strip())
        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        if max(similar_list) > self.threshold:
            item = NewsItem()
            item['title'] = title.strip()
            item['url'] = response.url.strip()
            item['net_name'] = '36氪'
            item['ent_time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(
                    time.mktime(
                        time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00"))))
            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = str(
                tools.divide_hot_degree(self.name, hot_degree))
            item['scan_id'] = str(self.scan_id)
            return item
    except:
        pass

def parse_content(self, response):
    soup = BeautifulSoup(response.body)
    # get the news publish time
    date = soup.select('td[class="time"]')[0].get_text().strip()
    date = time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M'))))
    # stop condition: drop news older than the configured window
    interval = tools.time_cmp(float(self.scan_id), date)
    if interval > self.days:
        print('______________过时新闻________________'.encode("utf-8").decode(
            self.decoding))
        return
    # get the news title
    title = soup.select('h1[class="headTit"]')[0].get_text().strip()
    # get the news lead
    leadword = soup.select(
        'div[class="article-lead"]')[0].get_text().strip()
    # get the bookmark count
    hot_degree = int(
        soup.find('a', attrs={
            'class': 'collect collect-no'
        }).find('span').get_text().strip())
    # get the news URL
    url = response.url
    # get the keywords
    keywords = []
    try:
        for i in soup.find('div', attrs={
                'class': 'related-link clr'
        }).children:
            keywords.append(i.string.strip())
    except:
        pass
    # get the news content
    comView = soup.select('div[class="lph-article-comView"]')[0]
    # remove template markup and JS
    try:
        [s.extract() for s in comView(['script', 'strong'])]
    except AttributeError:
        pass
    article = []
    for p in comView.find_all('p'):
        if p.get_text() is not None:
            article.append(p.get_text().strip())
    article = '\n'.join(article)
    temp_keywords, abstract = tools.leiphone_keyword_abstract(
        article, 3, 3)
    if len(keywords) == 0:
        keywords = temp_keywords
    keywords = ' '.join(keywords)
    print('雷锋网: '.encode("utf-8").decode(self.decoding),
          title.encode("utf-8").decode(self.decoding).strip())
    # wrap into an item
    similar_list = self.s.cal_similarities(article)
    if max(similar_list) > self.threshold:
        item = NewsItem()
        item['ent_time'] = date
        item['title'] = title.strip()
        item['url'] = url.strip()
        item['net_name'] = '雷锋网'
        item['keyword'] = keywords.strip()
        item['digest'] = abstract.strip()
        item['content'] = article.strip()
        item['hot_degree'] = str(
            tools.divide_hot_degree(self.name, hot_degree))
        item['scan_id'] = self.scan_id
        return item

def parse_item(self, response):
    if response.status == 200:
        try:
            # the body is not well-formed JSON on its own; wrap it with '[' and ']'
            data_list = json.loads('[' + response.body.decode() + ']')
            # print(data_list)
        except Exception as e:
            # print(e)
            item = ErrorItem()
            item['code'] = 901
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "选股宝"
            # description of the error
            item['desc'] = '响应的json数据错误'
            # exception raised by the code
            item['exception'] = str(e)
            yield item
        else:
            try:
                for data in data_list[0]["NewMsgs"]:
                    item = NewsItem()
                    # item['flag'] = 1
                    item['source'] = 'xuangubao'
                    item['pubDate'] = data['UpdatedAtInSec']
                    item['title'] = data['Title']
                    item['content'] = data['Summary']
                    # TODO
                    item['isRed'] = data['Impact']
                    yield item
            except Exception as e:
                # print(e)
                item = ErrorItem()
                item['code'] = 902
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "选股宝"
                # description of the error
                item['desc'] = '解析json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "选股宝"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

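# The bracket-wrapping trick above turns whatever single JSON document the
# endpoint returns into a one-element Python list, which is why the parser
# indexes data_list[0]. A standalone illustration with a made-up payload
# (field names copied from the spider, values invented):
import json

body = '{"NewMsgs": [{"Title": "示例", "Summary": "...", "UpdatedAtInSec": 0, "Impact": 0}]}'
data_list = json.loads('[' + body + ']')  # always yields a list
for data in data_list[0]["NewMsgs"]:
    print(data["Title"], data["UpdatedAtInSec"])
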
def parse_detail(self, response):
    soup = BeautifulSoup(response.body)
    soup.prettify()
    try:
        # get the news title
        title = soup.select('h1[class="main-title"]')[0].get_text()
        # get the news publish time
        date = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(
                time.mktime(
                    time.strptime(
                        soup.select('span[class="date"]')[0].get_text(),
                        '%Y年%m月%d日 %H:%M'))))
        # stop condition: drop news older than the configured window
        interval = tools.time_cmp(float(self.scan_id), date)
        if interval > self.days:
            print('______________过时新闻________________'.encode(
                "utf-8").decode(self.decoding))
            return
        # get the comment count
        hot_degree = int(
            soup.select('a[data-sudaclick="comment_sum_p"]')[0].get_text())
        # get the news keywords
        keywords = []
        try:
            a_list = soup.find_all('div',
                                   attrs={'class': 'keywords'})[0].find_all('a')
            for item in a_list:
                keywords.append(item.get_text())
        except:
            pass
        # get the news URL
        url = response.url
        # get the news content
        comView = soup.select('div[class="article"]')[0]
        # remove images and JS
        try:
            comView.style.decompose()
        except:
            pass
        try:
            for i in comView.find_all('script'):
                i.decompose()
            for i in comView.find_all('div'):
                i.decompose()
            comView.find('p', attrs={
                'class': 'article-editor'
            }).decompose()
        except AttributeError:
            pass
        article = []
        for p in comView.find_all('p'):
            if p.get_text() is not None:
                article.append(p.get_text().strip())
        article = '\n'.join(article)
        # generate keywords and abstract
        temp_keywords, abstract = tools.sina_keyword_abstract(
            article, 4, 5)
        if len(keywords) == 0:
            keywords = temp_keywords
        keywords = ' '.join(keywords)
        print('新浪网: '.encode("utf-8").decode(self.decoding),
              title.encode("utf-8").decode(self.decoding).strip())
        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        if max(similar_list) > self.threshold:
            item = NewsItem()
            item['title'] = title.strip()
            item['url'] = url.strip()
            item['net_name'] = '新浪网'
            item['ent_time'] = date
            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = str(
                tools.divide_hot_degree(self.name, hot_degree))
            item['scan_id'] = str(self.scan_id)
            return item
    except:
        try:
            # get the news title
            title = soup.select('h1[id="artibodyTitle"]')[0].get_text()
            # get the news publish time
            date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(
                    time.mktime(
                        time.strptime(
                            soup.select('span[id="pub_date"]')
                            [0].get_text().strip(), '%Y-%m-%d %H:%M:%S'))))
            # stop condition: drop news older than the configured window
            interval = tools.time_cmp(float(self.scan_id), date)
            if interval > self.days:
                print('______________过时新闻________________'.encode(
                    "utf-8").decode(self.decoding))
                return
            # get the comment count
            hot_degree = int(
                soup.select('a[data-sudaclick="comment_sum_p"]')
                [0].get_text())
            # get the news keywords
            keywords = []
            try:
                a_list = soup.find_all('p',
                                       attrs={'class': 'art_keywords'
                                              })[0].find_all('a')
                for item in a_list:
                    keywords.append(item.get_text())
            except:
                pass
            # get the news URL
            url = response.url
            # get the news content
            comView = soup.select('div[id="artibody"]')[0]
            # remove images and JS
            try:
                comView.style.decompose()
            except:
                pass
            try:
                for i in comView.find_all('script'):
                    i.decompose()
                for i in comView.find_all('div'):
                    i.decompose()
                comView.find('p', attrs={
                    'class': 'article-editor'
                }).decompose()
            except AttributeError:
                pass
            # save the news content
            article = []
            for p in comView.find_all('p'):
                if p.get_text() is not None:
                    article.append(p.get_text().strip())
            article = '\n'.join(article)
            # keywords and abstract
            temp_keywords, abstract = tools.sina_keyword_abstract(
                article, 4, 5)
            if len(keywords) == 0:
                keywords = temp_keywords
            keywords = ' '.join(keywords)
            print('新浪网: '.encode("utf-8").decode(self.decoding),
                  title.encode("utf-8").decode(self.decoding).strip())
            # wrap into an item
            similar_list = self.s.cal_similarities(article)
            if max(similar_list) > self.threshold:
                item = NewsItem()
                item['title'] = title.strip()
                item['url'] = url.strip()
                item['net_name'] = '新浪网'
                item['ent_time'] = date
                item['keyword'] = keywords.strip()
                item['digest'] = abstract.strip()
                item['content'] = article.strip()
                item['hot_degree'] = str(
                    tools.divide_hot_degree(self.name, hot_degree))
                item['scan_id'] = str(self.scan_id)
                return item
        except:
            pass

def parse_detail(self, response):
    soup = BeautifulSoup(response.body)
    soup.prettify()
    # get the news title
    title = soup.select('h1[class="main-title"]')[0].get_text()
    # get the news publish time
    date = soup.select('span[class="date"]')[0].get_text()
    # get the news content
    article = soup.select('div[class="article"]')[0]
    # get the news keywords
    keywords = []
    try:
        a_list = soup.find_all('div',
                               attrs={'class': 'keywords'})[0].find_all('a')
        for item in a_list:
            keywords.append(item.get_text())
    except:
        pass
    # get the news URL
    url = response.url
    # remove images and JS
    try:
        article.style.decompose()
    except:
        pass
    try:
        for i in article.find_all('script'):
            i.decompose()
        for i in article.find_all('div'):
            i.decompose()
        article.find('p', attrs={'class': 'article-editor'}).decompose()
    except AttributeError:
        article = article.get_text().strip()  # strip whitespace
    else:
        article = article.get_text().strip()  # strip whitespace
    temp_keywords, abstract = sina_keyword_abstract(article, 3, 3)
    if len(keywords) == 0:
        keywords = temp_keywords
    keywords = ' '.join(keywords)
    print('-----------------------------------------------')
    print('标题:', title)
    # print(article)
    print('关键词:', keywords)
    print('摘要:', end='\n')
    print(abstract)
    print('时间:', date)
    print('新闻URL:', url)
    print('相似度:', self.s.cal_similarities(article))
    print('-----------------------------------------------')
    # wrap into an item
    similar_list = self.s.cal_similarities(article)
    if max(similar_list) > self.threshold:
        print('存在相似,保存入数据库')
        print(
            '-----------------------------------------------------------------------------------------------------------------')
        item = NewsItem()
        item['title'] = title.strip()
        item['url'] = url.strip()
        item['net_name'] = '新浪网'
        item['ent_time'] = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(time.mktime(time.strptime(date, '%Y年%m月%d日 %H:%M'))))
        item['keyword'] = keywords.strip()
        item['digest'] = abstract.strip()
        item['content'] = article.strip()
        item['hot_degree'] = '0'
        item['scan_id'] = str(self.scan_id)
        return item
    else:
        print('没超过阈值,pass')
        print(
            '-----------------------------------------------------------------------------------------------------------------')
        pass

def parse_content(self, response):
    soup = BeautifulSoup(response.body)
    # get the publish time
    date = response.meta['date']
    # stop condition: drop news older than the configured window
    interval = time_cmp(
        float(self.scan_id),
        time.strftime("%Y-%m-%d %H:%M:%S",
                      time.localtime(time.mktime(
                          time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00")))))
    if interval > self.days:
        print('过时')
        return
    # get the title
    title = soup.find('div', attrs={'class': 'mobile_article'}).find('h1').get_text()
    # get the article body
    article = soup.find('section', attrs={'class': 'textblock'})
    try:
        if article.find('p').get_text().strip().startswith('编者'):
            article.find('p').decompose()
    except:
        pass
    article = article.get_text().strip()
    # get the summary
    summary = soup.find('section', attrs={'class': 'summary'}).get_text().strip()
    # get keywords and abstract
    keywords, abstract = _36r_keyword_abstract(article, 3, 3)
    raw_keywords = []
    for item in soup.find_all('a', attrs={'class': 'kr-tag-gray'}):
        raw_keywords.append(item.get_text())
    if len(raw_keywords) != 0:
        keywords = raw_keywords
    keywords = ' '.join(keywords)
    print('-----------------------------------------------')
    print('标题:', title)
    print('总结:', summary)
    print('关键词:', keywords)
    # print(article)
    print('摘要:', end='')
    print(abstract)
    print('url:', response.url)
    print('时间:', time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(time.mktime(
                                     time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00")))))
    print('相似度', self.s.cal_similarities(article))
    print('-----------------------------------------------')
    # wrap into an item
    similar_list = self.s.cal_similarities(article)
    if max(similar_list) > self.threshold:
        print('存在相似,保存入数据库')
        print(
            '-----------------------------------------------------------------------------------------------------------------')
        item = NewsItem()
        item['title'] = title.strip()
        item['url'] = response.url.strip()
        item['net_name'] = '36氪'
        item['ent_time'] = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(time.mktime(
                time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00"))))
        item['keyword'] = keywords.strip()
        item['digest'] = abstract.strip()
        item['content'] = article.strip()
        item['hot_degree'] = '0'
        item['scan_id'] = str(self.scan_id)
        return item
    else:
        print('没超过阈值,pass')
        print(
            '-----------------------------------------------------------------------------------------------------------------')
        pass

def parse_content(self, response):
    soup = BeautifulSoup(response.body)
    # get the news publish time
    date = soup.select('td[class="time"]')[0].get_text().strip()
    date = time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M'))))
    # stop condition: drop news older than the configured window
    interval = time_cmp(float(self.scan_id), date)
    if interval > self.days:
        print('新闻过时')
        return
    # get the news title
    title = soup.select('h1[class="headTit"]')[0].get_text().strip()
    # get the news lead
    leadword = soup.select('div[class="article-lead"]')[0].get_text().strip()
    # get the news URL
    url = response.url
    # get the news content
    article = soup.select('div[class="lph-article-comView"]')[0]
    keywords = []
    try:
        for i in soup.find('div', attrs={'class': 'related-link clr'}).children:
            keywords.append(i.string.strip())
    except:
        pass
    # remove template markup and JS
    try:
        [s.extract() for s in article(['script', 'strong'])]
    except AttributeError:
        article = fix_content(article.get_text())  # strip whitespace
    else:
        article = fix_content(article.get_text())  # strip whitespace
    temp_keywords, abstract = leiphone_keyword_abstract(article, 3, 3)
    if len(keywords) == 0:
        keywords = temp_keywords
    keywords = ' '.join(keywords)
    print('-----------------------------------------------')
    print('标题:', title)
    print(leadword)  # lead paragraph
    # print(article)
    print('关键词:', keywords)
    print('摘要:', end='')
    print(abstract)
    print('时间:', date)
    print('新闻URL:', url)
    print('相似度:', self.s.cal_similarities(article))
    print('-----------------------------------------------')
    # wrap into an item
    similar_list = self.s.cal_similarities(article)
    if max(similar_list) > self.threshold:
        print('存在相似,保存入数据库')
        print(
            '-----------------------------------------------------------------------------------------------------------------')
        item = NewsItem()
        item['ent_time'] = date
        item['title'] = title.strip()
        item['url'] = url.strip()
        item['net_name'] = '雷锋网'
        item['keyword'] = keywords.strip()
        item['digest'] = abstract.strip()
        item['content'] = article.strip()
        item['hot_degree'] = '0'
        item['scan_id'] = self.scan_id
        return item
    else:
        print('没超过阈值,pass')
        print(
            '-----------------------------------------------------------------------------------------------------------------')
        pass

def parse_item(self, response):
    if response.status == 200:
        lis = response.xpath('//ul[@class="live-list"]/li')
        if lis is None or len(lis) == 0:
            item = ErrorItem()
            item['code'] = 801
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "每经网"
            # description of the error
            item['desc'] = '未找到html元素'
            # exception raised by the code
            item['exception'] = ''
            # yield instead of return: this method is a generator
            yield item
            return
        try:
            riqi = response.xpath(
                '//p[@class="live"]/span/text()').getall()  # e.g. 2019年05月20日
            date = ''
            for temp in riqi:
                if "年" in temp:
                    date = temp.replace("\n", "").replace("\n\r", "").replace(
                        "\r\n", "").replace("\r", "").strip()
                    break
            for li in lis:
                i = NewsItem()
                i['source'] = "nbd"
                timeStamp = ''
                try:
                    temp = (li.xpath(
                        './div[@class="li-title"]/p/span/text()').get())  # e.g. 17:44:42
                    temp = temp.replace("\n", "").replace("\n\r", "").replace(
                        "\r\n", "").replace("\r", "").strip()
                    temp = date + temp  # e.g. 2019年05月16日18:26:27
                    d = datetime.datetime.strptime(temp, "%Y年%m月%d日%H:%M:%S")
                    t = d.timetuple()
                    timeStamp = time.mktime(t)
                except Exception as e:
                    print(e)
                    i['pubDate'] = ""
                else:
                    i['pubDate'] = timeStamp
                i['title'] = ""
                i['content'] = li.xpath(
                    './div[@class="li-text"]/a/text()').get()
                # TODO
                i['isRed'] = 0
                yield i
        except Exception as e:
            item = ErrorItem()
            item['code'] = 802
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "每经网"
            # description of the error
            item['desc'] = '解析html标签错误'
            # exception raised by the code
            item['exception'] = str(e)
            yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "每经网"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

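# The pubDate conversion above glues a date taken from the page header onto a
# time taken from each list entry and runs it through strptime/mktime. A
# standalone round trip with sample values taken from the comments in the code:
import datetime
import time

temp = '2019年05月16日' + '18:26:27'           # -> 2019年05月16日18:26:27
d = datetime.datetime.strptime(temp, "%Y年%m月%d日%H:%M:%S")
timeStamp = time.mktime(d.timetuple())        # Unix timestamp, local time
print(int(timeStamp))
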
def parse_item(self, response):
    if response.status == 200:
        try:
            temp = re.findall('__NEXT_DATA__.*module',
                              response.body.decode(), re.S)[0].replace(
                                  '__NEXT_DATA__ =', '').replace(
                                      '__NEXT_DATA__', '').replace('module', '').strip()
            data_list = json.loads('[' + temp + ']')
            # print(data_list)
        except Exception as e:
            print(e)
            item = ErrorItem()
            item['code'] = 800
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "财联社"
            # description of the error
            item['desc'] = '响应的json数据错误'
            # exception raised by the code
            item['exception'] = str(e)
            yield item
        else:
            try:
                for data in data_list[0]["props"]['initialState'][
                        'telegraph']['dataList']:
                    item = NewsItem()
                    item['source'] = 'cls'
                    item['pubDate'] = data['modified_time']
                    item['title'] = data['title']
                    if '【' in data['content'] and '】' in data['content']:
                        item['content'] = re.findall(
                            '】.*', data['content'])[0].replace("】", '')
                    else:
                        item['content'] = data['content']
                    # TODO
                    item['isRed'] = 0
                    yield item
            except Exception as e:
                item = ErrorItem()
                item['code'] = 902
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "财联社"
                # description of the error
                item['desc'] = '解析json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
    else:
        item = ErrorItem()
        item['code'] = response.status
        # page where the error occurred
        item['url'] = response.url
        # time of the error
        # item['timestamp'] = time.time()
        # site where the error occurred
        item['site'] = "财联社"
        # description of the error
        item['desc'] = '响应错误'
        # exception raised by the code
        item['exception'] = ''
        yield item

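# The last spider pulls a Next.js page's inline __NEXT_DATA__ state out of the
# raw HTML with a regex rather than an XPath. A minimal illustration with a
# made-up page body (the real page embeds the JSON between '__NEXT_DATA__ ='
# and a later 'module' token, which is what the replace chain strips off):
import json
import re

html = '<script>__NEXT_DATA__ = {"props": {"initialState": {"telegraph": {"dataList": []}}}} module</script>'
temp = re.findall('__NEXT_DATA__.*module', html, re.S)[0]
temp = temp.replace('__NEXT_DATA__ =', '').replace('__NEXT_DATA__', '').replace('module', '').strip()
data_list = json.loads('[' + temp + ']')
print(data_list[0]["props"]["initialState"]["telegraph"]["dataList"])  # -> []
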