Example 1
    def parse_item(self, response):

        if response.status == 200:
            try:
                data = json.loads('['+response.body.decode()+']')
                print(data)
            except Exception as e:
                print(e)
                item = ErrorItem()
                item['code'] = 901
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "httpbin"
                # description of the error
                item['desc'] = '响应的json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
                return
            else:
                try:
                    for item in data:
                        i = NewsItem()
                        i['source'] = "httpbin"
                        # print(item)
                        i['pubDate'] = ""
                        i['title'] = ""
                        i['content'] = item['origin']
                        yield i
                except Exception as e:
                    item = ErrorItem()
                    item['code'] = 902
                    # page where the error occurred
                    item['url'] = response.url
                    # time of the error
                    # item['timestamp'] = time.time()
                    # site where the error occurred
                    item['site'] = "httpbin"
                    # description of the error
                    item['desc'] = '解析json数据错误'
                    # exception raised by the code
                    item['exception'] = str(e)
                    yield item
        else:
            item = ErrorItem()
            item['code'] = response.status
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "httpbin"
            # description of the error
            item['desc'] = '响应错误'
            # exception raised by the code
            item['exception'] = ''
            yield item
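All of these callbacks fill NewsItem and ErrorItem objects whose definitions are not shown. A minimal sketch of what such Scrapy item classes could look like, with field names taken from the examples (the real projects may declare more fields):

    import scrapy

    class NewsItem(scrapy.Item):
        # news fields used by the parse callbacks above (not exhaustive)
        source = scrapy.Field()
        pubDate = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        isRed = scrapy.Field()

    class ErrorItem(scrapy.Item):
        # error-reporting fields used by the parse callbacks above
        code = scrapy.Field()       # HTTP status or internal error code
        url = scrapy.Field()        # page where the error occurred
        site = scrapy.Field()       # site name
        desc = scrapy.Field()       # human-readable description
        exception = scrapy.Field()  # str(e) of the caught exception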
Example 2
 def parse(self, response):
     item = NewsItem()
     movieList = response.xpath(
         '//div[@class="col01L"]/div[@class="box_02"]/ul/li')
     for movie in movieList:
         link = movie.xpath('.//a/@href').extract_first()
         item["link"] = link
         yield Request(link, callback=self.parse2, meta=item)
Example 3
 def parse_item(self, response):
     if response.status == 200:
         try:
             data = json.loads(response.body.decode())
         except Exception as e:
             print(e)
             item = ErrorItem()
             item['code'] = 901
             # page where the error occurred
             item['url'] = response.url
             # time of the error
             # item['timestamp'] = time.time()
             # site where the error occurred
             item['site'] = "e公司"
             # description of the error
             item['desc'] = '响应的json数据错误'
             # exception raised by the code
             item['exception'] = str(e)
             yield item
             return
         else:
             try:
                 for item in data['data']:
                     i = NewsItem()
                     i['source'] = "egs"
                     # print(item)
                     i['pubDate'] = item.get('pageTime', "")
                     i['title'] = item.get('title', "")
                     i['content'] = item.get('content', "")
                     i['isRed'] = item.get('isRed', 0)
                     yield i
             except Exception as e:
                 item = ErrorItem()
                 item['code'] = 902
                 # page where the error occurred
                 item['url'] = response.url
                 # time of the error
                 # item['timestamp'] = time.time()
                 # site where the error occurred
                 item['site'] = "e公司"
                 # description of the error
                 item['desc'] = '解析json数据错误'
                 # exception raised by the code
                 item['exception'] = str(e)
                 yield item
     else:
         item = ErrorItem()
         item['code'] = response.status
          # page where the error occurred
          item['url'] = response.url
          # time of the error
          # item['timestamp'] = time.time()
          # site where the error occurred
          item['site'] = "e公司"
          # description of the error
          item['desc'] = '响应错误'
          # exception raised by the code
         item['exception'] = ''
         yield item
Example 4
 def parse_comment(self, response):
     result = json.loads(response.text)
     item = NewsItem()
     item['source'] = response.meta['source']
     item['date'] = response.meta['date']
     item['newsId'] = response.meta['newsId']
     item['url'] = response.meta['url']
     item['title'] = response.meta['title']
     item['contents'] = response.meta['contents']
     item['comments'] = result['cmtAgainst'] + result['cmtVote'] + result['rcount']
     item['time'] = response.meta['time']
     return item
Example 5
 def parse_content(self, response):
     try:
         soup = BeautifulSoup(response.body)
         date = time.strftime(
             "%Y-%m-%d %H:%M:%S",
             time.localtime(
                 time.mktime(
                     time.strptime(response.meta['publish_time'],
                                   "%Y-%m-%d %H:%M:%S"))))
         # stop condition
         interval = tools.time_cmp(float(self.scan_id), date)
         if interval > self.days:
             print('______________过时新闻________________'.encode(
                 "utf-8").decode(self.decoding))
             return
         title = soup.find('div', attrs={
             'class': 'LEFT'
         }).find('h1').get_text()
         hot_degree = int(response.meta['comment_num'])
         keywords = ' '.join(response.meta['keywords'])
         # remove the div node
         soup.find('div', attrs={
             'class': 'content-article'
         }).find('div').decompose()
         article = []
         for p in soup.find('div', attrs={
                 'class': 'content-article'
         }).find_all('p'):
             if p.get_text() is not None:
                 article.append(p.get_text().strip())
         article = '\n'.join(article)
         abstract = tools.tencent_keyword_abstract(article, 4)
         # wrap into an item
         similar_list = self.s.cal_similarities(article)
         print('腾讯网: '.encode("utf-8").decode(self.decoding),
               title.encode("utf-8").decode(self.decoding).strip())
         if max(similar_list) > self.threshold:
             item = NewsItem()
             item['title'] = title.strip()
             item['url'] = response.url.strip()
             item['net_name'] = '腾讯'
             item['ent_time'] = date
             item['keyword'] = keywords.strip()
             item['digest'] = abstract.strip()
             item['content'] = article.strip()
             item['hot_degree'] = str(
                 tools.divide_hot_degree(self.name, hot_degree))
             item['scan_id'] = str(self.scan_id)
             return item
     except:
         pass
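Example 5 constructs the soup as BeautifulSoup(response.body) without naming a parser, so bs4 has to guess one and warns about it. A small sketch of the more explicit form (using the stdlib parser; 'lxml' would also work if it is installed):

    from bs4 import BeautifulSoup

    def make_soup(body: bytes) -> BeautifulSoup:
        # pin the parser instead of letting bs4 pick whichever backend is available
        return BeautifulSoup(body, 'html.parser')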
Example 6
    def parse_page(self, response):
        if response.status == 200:
            lis = response.xpath('/html/body/div/div/ul/li')
            if lis is None or len(lis) == 0:
                item = ErrorItem()
                item['code'] = 801
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                item['date'] = time.time()
                # site where the error occurred
                item['site'] = "中证网"
                # description of the error
                item['desc'] = '未找到html元素'
                # exception raised by the code
                item['exception'] = ''
                yield item
                return
            try:
                for li in lis:
                    item = NewsItem()
                    item['source'] = "cs"

                    # temp = li.xpath('./span/text()').get().strip()  # 19-05-16 18:43
                    # temp = '20' + temp
                    # d = datetime.datetime.strptime(temp, "%Y-%m-%d %H:%M")
                    # t = d.timetuple()
                    # timeStamp = int(time.mktime(t))
                    #
                    # item['pubDate'] = timeStamp

                    item['pubDate'] = ''
                    item['title'] = li.xpath('./a/text()').get()
                    url = r'http://www.cs.com.cn/sylm/jsbd/' + li.xpath('./a/@href').get()

                    yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_item, dont_filter=True)

            except Exception as e:
                item = ErrorItem()
                item['code'] = 802
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "中证网"
                # description of the error
                item['desc'] = '解析html元素错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
Example 7
 def parse_comment(self, response):
     if re.findall(r'"total":(\d*)\,', response.text):
         comments = re.findall(r'"total":(\d*)\,', response.text)[0]
     else:
         comments = 0
     item = NewsItem()
     item['source'] = response.meta['source']
     item['time'] = response.meta['time']
     item['date'] = response.meta['date']
     item['contents'] = response.meta['contents']
     item['title'] = response.meta['title']
     item['url'] = response.meta['url']
     item['newsId'] = response.meta['newsId']
     item['comments'] = comments
     return item
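Example 7 pulls the comment total out of the response body with a regex. If the endpoint actually returns well-formed JSON with a top-level 'total' key (an assumption, not verified here), the same value could be read with json.loads; a minimal sketch with the same fallback to 0:

    import json

    def extract_total(text: str) -> int:
        # mirror the regex version: default to 0 when the body is not JSON
        # or does not carry a numeric 'total' field
        try:
            return int(json.loads(text).get('total', 0))
        except (ValueError, AttributeError):
            return 0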
Example 8
    def parse_news(self, response):
        sel = Selector(response)
        if sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'):
            title = sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract()[0]
        elif sel.xpath('//*[@id="C-Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'):
            title = sel.xpath('//*[@id="C-Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract()[0]
        elif sel.xpath('//*[@id="ArticleTit"]/text()'):
            title = sel.xpath('//*[@id="ArticleTit"]/text()').extract()[0]
        else:
            title = 'unknown'
        pattern = re.match(self.url_pattern, str(response.url))
        source = 'tencent'
        date = pattern.group(2)
        date = date[0:4] + '/' + date[4:6] + '/' + date[6:]
        newsId = pattern.group(3)
        url = response.url
        if sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()'):
            time_ = sel.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()').extract()[0]
        else:
            time_ = 'unknown'
        contents = ListCombiner(sel.xpath('//p/text()').extract()[:-8])

        if response.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[2]/script[2]/text()'):
            cmt = response.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[2]/script[2]/text()').extract()[0]
            if re.findall(r'cmt_id = (\d*);', cmt):
                cmt_id = re.findall(r'cmt_id = (\d*);', cmt)[0]
                comment_url = 'http://coral.qq.com/article/{}/comment?commentid=0&reqnum=1&tag=&callback=mainComment&_=1389623278900'.format(cmt_id)
                yield Request(comment_url, self.parse_comment, meta={'source': source,
                                                                     'date': date,
                                                                     'newsId': newsId,
                                                                     'url': url,
                                                                     'title': title,
                                                                     'contents': contents,
                                                                     'time': time_
                                                                     })
            else:
                item = NewsItem()
                item['source'] = source
                item['time'] = time_
                item['date'] = date
                item['contents'] = contents
                item['title'] = title
                item['url'] = url
                item['newsId'] = newsId
                item['comments'] = 0
                yield item
Example 9
 def parse2(self, response):
     item2 = NewsItem()
     item = response.meta
     content = response.xpath('//div[@id="artical"]')
     #http://finance.ifeng.com/a/20180806/16429540_0.shtml
     if 'finance.ifeng.com/a' not in item['link'] or '.shtml' not in item[
             'link']:
         return
     # time check: only keep news published within the last ~20 minutes
     publishTime = content[0].xpath(
         './/span[@itemprop="datePublished"]/text()').extract_first()
     if publishTime < (
         (datetime.datetime.now() -
          datetime.timedelta(minutes=19)).strftime('%Y-%m-%d %H:%M:%S')):
         return
     item2["createTime"] = publishTime
     #title, artical, origin, link
     # source
     origin = content[0].xpath(
         './/span[@itemprop="publisher"]/span/a/text()').extract_first()
     if origin is None:
         origin = content[0].xpath(
             './/span[@itemprop="publisher"]/span/text()').extract_first()
     item2["source"] = origin
     # title
     title = content[0].xpath(
         './/h1[@itemprop="headline"]/text()').extract_first()
     item2["title"] = title
     # content
     artical = content[0].xpath('.//div[@id="artical_real"]').xpath(
         'string(.)').extract_first()
     item2["content"] = artical.replace('\r\n',
                                        '').replace(' ',
                                                    '').replace('\n',
                                                                '')[0:200]
     item2["link"] = item["link"]
     item2["uuid"] = uuid.uuid1()
     yield item2
Example 10
def news_parser(news_config: dict, response: Response):
    title = _extract([XPathFirst(xpath_str) for xpath_str in news_config['title_xpath_list']], response)
    author = _extract([XPathFirst(xpath_str) for xpath_str in news_config['author_xpath_list']], response)
    publish_time = _extract([XPathFirst(xpath_str) for xpath_str in news_config['publish_time_xpath_list']], response)
    rich_content_origin = _extract([XPathFirst(xpath_str) for xpath_str in news_config['content_xpath_list']], response)
    if not (title and publish_time and rich_content_origin):
        logging.debug(f'[NOT NEWS]<title>:{title}, '
                      f'<publish_time>:{publish_time} '
                      f'<content_text>:{bool(rich_content_origin)}'
                      f'<url>:{response.url}')
        return False, None
    cleaned_content, cleaned_content_text = clean_html_content_text(rich_content_origin)
    logging.debug(f'[IS NEWS]<title>:{title}, '
                  f'<publish_time>:{publish_time} '
                  f'<content_text>:{cleaned_content_text}'
                  f'<url>:{response.url}')
    return True, NewsItem(
        source=news_config['source'],
        title=title,
        author=author,
        publish_time=publish_time,
        content=cleaned_content_text,
        rich_content=cleaned_content,
    )
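The news_parser function above depends on project helpers (_extract, XPathFirst, clean_html_content_text) whose code is not included. A rough sketch of what a first-match XPath extractor along these lines might look like; the names and behaviour are assumptions for illustration, not the project's actual implementation:

    from typing import Callable, Iterable, Optional

    class XPathFirst:
        # hypothetical helper: wraps an XPath expression and returns its first match
        def __init__(self, xpath_str: str):
            self.xpath_str = xpath_str

        def __call__(self, response) -> Optional[str]:
            value = response.xpath(self.xpath_str).get()  # Scrapy selector API
            return value.strip() if value else None

    def _extract(extractors: Iterable[Callable], response) -> Optional[str]:
        # try each candidate XPath in order and keep the first non-empty result
        for extractor in extractors:
            value = extractor(response)
            if value:
                return value
        return None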
Example 11
 def parse_detail(self, response):
     newsitem = NewsItem()
     selector = Selector(response)
     current_url = response.url  # get the current URL
     # print(current_url)
     url = response.meta['url']
     cate = response.meta['cate']
     source = response.meta['source']
     tag = response.meta['tag']
     tags = tag.split(':')
     xpath_rule = './/div[@' + tags[0] + '=' + "'" + tags[1] + "'" + ']/p'
     res = selector.xpath(xpath_rule)
     content = ''
     for c in res:
         content += c.xpath('string(.)').extract_first() + '\r\n'
     content = re.sub('[\u3000 \xa0 \\t \u200b  ■]+', '', content)
     content = re.sub(r'showPlayer.*?;', '', content)  # strip special markers found on 人民网 (people.cn) pages
     content = '\r\n'.join([
         c.replace('\n', '') for c in content.split('\r\n')
         if c.strip() and len(c.strip()) > 20
     ])
     if content:
         newsitem['url'] = current_url
         newsitem['content'] = content
         newsitem['source'] = source
         newsitem['category'] = cate
         yield newsitem
     else:
         yield scrapy.Request(url=current_url,
                              meta={
                                  'url': url,
                                  'cate': cate,
                                  'source': source,
                                  'tag': tag
                              },
                              callback=self.parse)
Example 12
    def parse_item(self, response):
        if response.status == 200:

            lis = response.xpath('//ul[@class="nf-list"]/li')
            if lis is None or len(lis) == 0:
                item = ErrorItem()
                item['code'] = 801
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "上证快讯"
                # description of the error
                item['desc'] = '未找到html元素'
                # exception raised by the code
                item['exception'] = ''
                yield item
                return
            try:
                # date string, e.g. 2019年05月16日
                # riqi = response.xpath('//div[@class="nf-head"]/p/text()').get().strip()
                for li in lis:
                    item = NewsItem()
                    item['source'] = "cnstock"
                    # temp = li.xpath('./p[1]/text()').get()  # e.g. 20:30
                    # temp = riqi + temp  # 2019年05月16日20:30
                    #
                    # d = datetime.datetime.strptime(temp, "%Y年%m月%d日%H:%M")
                    # t = d.timetuple()
                    # timeStamp = int(time.mktime(t))
                    #
                    # item['pubDate'] = timeStamp

                    item['pubDate'] = ''
                    title_conent = li.xpath('./p[2]/a/text()').get()

                    # example input:
                    '''
                    【压垮乐视网的最后一根稻草竟然是它!】15日,进入暂停上市状态第三天的乐视网披露,因乐视体育经营不利导致增资协议中的对赌条款失败,乐视体育股东之一的前海思拓提出的涉及回购融资股权的仲裁申请,得到了北京仲裁委员会的支持。
                    '''
                    item['title'] = (re.findall('【.*】', title_conent)[0]).replace('【', '').replace('】', '')
                    item['content'] = re.findall('】.*', title_conent)[0].replace('】', '')


                    item['isRed'] = 0
                    yield item
            except Exception as e:
                item = ErrorItem()
                item['code'] = 802
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "上证快讯"
                # description of the error
                item['desc'] = '解析html标签错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item

        else:
            item = ErrorItem()
            item['code'] = response.status
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "上证快讯"
            # description of the error
            item['desc'] = '响应错误'
            # exception raised by the code
            item['exception'] = ''
            yield item
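Example 12 (and several of the other flash-news spiders) splits strings of the form 【title】body with two re.findall calls. A standalone sketch of that split, with a guard for strings that have no 【】 marker (the sample input is made up):

    import re

    def split_flash_news(text: str):
        # returns (title, body); the title is empty when no 【…】 marker is present
        match = re.match(r'【(?P<title>[^】]*)】(?P<body>.*)', text, re.S)
        if match:
            return match.group('title'), match.group('body').strip()
        return '', text.strip()

    title, body = split_flash_news('【示例标题】示例正文……')  # hypothetical input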
Example 13
    def parse_item(self, response):

        if response.status == 200:
            try:
                data_list = json.loads(response.body.decode())
                # print(data_list)
            except Exception as e:
                print(e)
                item = ErrorItem()
                item['code'] = 800
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "第一财经"
                # description of the error
                item['desc'] = '响应的json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
            else:
                try:
                    for data in data_list:
                        item = NewsItem()
                        item['source'] = 'yicai'
                        date = data['datekey'] + " " + data[
                            'hm']  # e.g. 2019.05.16 20:43
                        # print('<<<<<<<<<<< ' + temp + ' >>>>>>>>>>>')
                        d = datetime.datetime.strptime(date, "%Y.%m.%d %H:%M")
                        t = d.timetuple()
                        item['pubDate'] = int(time.mktime(t))

                        # print(item)
                        title_conent = data['newcontent']

                        # example input: 【传化智联:非公开发行股票方案到期失效】 传化智联5月16日晚间公告,公司于2017年度股东大会审议通过《关于公司非公开发行股票方案的议案》,因资本市场环境变化等因素,公司此次非公开发行股票事项尚未取得实质进展。目前,此次非公开发行股票方案到期自动失效。 ",
                        item['title'] = re.findall('【.*】',
                                                   title_conent)[0].replace(
                                                       '【',
                                                       '').replace('】', '')
                        item['content'] = re.findall('】.*',
                                                     title_conent)[0].replace(
                                                         '】', '')

                        # TODO
                        item['isRed'] = 0
                        yield item
                except Exception as e:
                    item = ErrorItem()
                    item['code'] = 902
                    # page where the error occurred
                    item['url'] = response.url
                    # time of the error
                    # item['timestamp'] = time.time()
                    # site where the error occurred
                    item['site'] = "第一财经"
                    # description of the error
                    item['desc'] = '解析json数据错误'
                    # exception raised by the code
                    item['exception'] = str(e)
                    yield item
        else:
            item = ErrorItem()
            item['code'] = response.status
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "第一财经"
            # description of the error
            item['desc'] = '响应错误'
            # exception raised by the code
            item['exception'] = ''
            yield item
Example 14
 def parse_content(self, response):
     try:
         soup = BeautifulSoup(response.body)
         # get the time
         date = response.meta['date']
         # stop condition
         interval = tools.time_cmp(
             float(self.scan_id),
             time.strftime(
                 "%Y-%m-%d %H:%M:%S",
                 time.localtime(
                     time.mktime(
                         time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00")))))
         if interval > self.days:
             print('______________过时新闻________________'.encode(
                 "utf-8").decode(self.decoding))
             return
         # get the title
         title = soup.find('div', attrs={
             'class': 'mobile_article'
         }).find('h1').get_text()
         # get the article body
         textblock = soup.find('section', attrs={'class': 'textblock'})
         try:
             if textblock.find('p').get_text().strip().startswith('编者'):
                 textblock.find('p').find('p').decompose()
         except:
             pass
         article = []
         for p in textblock.find_all('p'):
             if p.get_text() is not None:
                 article.append(p.get_text().strip())
         article = '\n'.join(article)
         # get the summary
         summary = soup.find('section', attrs={
             'class': 'summary'
         }).get_text().strip()
         # get the like count
         hot_degree = int(
             soup.find('b', attrs={
                 'class': 'count-min'
             }).get_text().strip())
         # get keywords and abstract
         keywords, abstract = tools._36r_keyword_abstract(article, 3, 3)
         raw_keywords = []
         for item in soup.find_all('a', attrs={'class': 'kr-tag-gray'}):
             raw_keywords.append(item.get_text())
         if len(raw_keywords) != 0:
             keywords = raw_keywords
         keywords = ' '.join(keywords)
         print('36氪: '.encode("utf-8").decode(self.decoding),
               title.encode("utf-8").decode(self.decoding).strip())
         # wrap into an item
         similar_list = self.s.cal_similarities(article)
         if max(similar_list) > self.threshold:
             item = NewsItem()
             item['title'] = title.strip()
             item['url'] = response.url.strip()
             item['net_name'] = '36氪'
             item['ent_time'] = time.strftime(
                 "%Y-%m-%d %H:%M:%S",
                 time.localtime(
                     time.mktime(
                         time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00"))))
             item['keyword'] = keywords.strip()
             item['digest'] = abstract.strip()
             item['content'] = article.strip()
             item['hot_degree'] = str(
                 tools.divide_hot_degree(self.name, hot_degree))
             item['scan_id'] = str(self.scan_id)
             return item
     except:
         pass
Example 15
    def parse_content(self, response):
        soup = BeautifulSoup(response.body)
        # get the news publication time
        date = soup.select('td[class="time"]')[0].get_text().strip()
        date = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M'))))
        # stop condition
        interval = tools.time_cmp(float(self.scan_id), date)
        if interval > self.days:
            print('______________过时新闻________________'.encode("utf-8").decode(
                self.decoding))
            return

        # get the news title
        title = soup.select('h1[class="headTit"]')[0].get_text().strip()
        # get the news lead paragraph
        leadword = soup.select(
            'div[class="article-lead"]')[0].get_text().strip()
        # get the bookmark count
        hot_degree = int(
            soup.find('a', attrs={
                'class': 'collect collect-no'
            }).find('span').get_text().strip())
        # get the news URL
        url = response.url
        # get the keywords
        keywords = []
        try:
            for i in soup.find('div', attrs={
                    'class': 'related-link clr'
            }).children:
                keywords.append(i.string.strip())
        except:
            pass
        # get the news content
        comView = soup.select('div[class="lph-article-comView"]')[0]
        # remove template markup and JS
        try:
            [s.extract() for s in comView(['script', 'strong'])]
        except AttributeError:
            pass
        article = []
        for p in comView.find_all('p'):
            if p.get_text() is not None:
                article.append(p.get_text().strip())
        article = '\n'.join(article)
        temp_keywords, abstract = tools.leiphone_keyword_abstract(
            article, 3, 3)
        if len(keywords) == 0:
            keywords = temp_keywords
        keywords = ' '.join(keywords)

        print('雷锋网: '.encode("utf-8").decode(self.decoding),
              title.encode("utf-8").decode(self.decoding).strip())
        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        if max(similar_list) > self.threshold:
            item = NewsItem()
            item['ent_time'] = date
            item['title'] = title.strip()
            item['url'] = url.strip()
            item['net_name'] = '雷锋网'
            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = str(
                tools.divide_hot_degree(self.name, hot_degree))
            item['scan_id'] = self.scan_id
            return item
Example 16
    def parse_item(self, response):

        if response.status == 200:
            try:
                # the body is not valid JSON on its own; wrap it in '[' and ']'
                data_list = json.loads('[' + response.body.decode() + ']')
                # print(data_list)
            except Exception as e:
                # print(e)
                item = ErrorItem()
                item['code'] = 901
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "选股宝"
                # description of the error
                item['desc'] = '响应的json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
            else:
                try:
                    for data in data_list[0]["NewMsgs"]:
                        item = NewsItem()
                        # item['flag'] = 1
                        item['source'] = 'xuangubao'
                        item['pubDate'] = data['UpdatedAtInSec']
                        item['title'] = data['Title']
                        item['content'] = data['Summary']
                        # TODO
                        item['isRed'] = data['Impact']
                        yield item

                except Exception as e:
                    # print(e)
                    item = ErrorItem()
                    item['code'] = 902
                    # page where the error occurred
                    item['url'] = response.url
                    # time of the error
                    # item['timestamp'] = time.time()
                    # site where the error occurred
                    item['site'] = "选股宝"
                    # description of the error
                    item['desc'] = '解析json数据错误'
                    # exception raised by the code
                    item['exception'] = str(e)
                    yield item

        else:
            item = ErrorItem()
            item['code'] = response.status
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "选股宝"
            # description of the error
            item['desc'] = '响应错误'
            # exception raised by the code
            item['exception'] = ''
            yield item
Example 17
    def parse_detail(self, response):
        soup = BeautifulSoup(response.body)
        soup.prettify()
        try:
            # get the news title
            title = soup.select('h1[class="main-title"]')[0].get_text()
            # get the news publication time
            date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(
                    time.mktime(
                        time.strptime(
                            soup.select('span[class="date"]')[0].get_text(),
                            '%Y年%m月%d日 %H:%M'))))
            # stop condition
            interval = tools.time_cmp(float(self.scan_id), date)
            if interval > self.days:
                print('______________过时新闻________________'.encode(
                    "utf-8").decode(self.decoding))
                return
            # get the comment count
            hot_degree = int(
                soup.select('a[data-sudaclick="comment_sum_p"]')[0].get_text())
            # get the news keywords
            keywords = []
            try:
                a_list = soup.find_all('div',
                                       attrs={'class':
                                              'keywords'})[0].find_all('a')
                for item in a_list:
                    keywords.append(item.get_text())
            except:
                pass
            # get the news URL
            url = response.url
            # get the news content
            comView = soup.select('div[class="article"]')[0]
            # remove images and JS
            try:
                comView.style.decompose()
            except:
                pass
            try:
                for i in comView.find_all('script'):
                    i.decompose()
                for i in comView.find_all('div'):
                    i.decompose()
                comView.find('p', attrs={
                    'class': 'article-editor'
                }).decompose()
            except AttributeError:
                pass
            article = []
            for p in comView.find_all('p'):
                if p.get_text() is not None:
                    article.append(p.get_text().strip())
            article = '\n'.join(article)
            # generate keywords and abstract
            temp_keywords, abstract = tools.sina_keyword_abstract(
                article, 4, 5)
            if len(keywords) == 0:
                keywords = temp_keywords
            keywords = ' '.join(keywords)

            print('新浪网: '.encode("utf-8").decode(self.decoding),
                  title.encode("utf-8").decode(self.decoding).strip())
            # wrap into an item
            similar_list = self.s.cal_similarities(article)
            if max(similar_list) > self.threshold:
                item = NewsItem()
                item['title'] = title.strip()
                item['url'] = url.strip()
                item['net_name'] = '新浪网'
                item['ent_time'] = date
                item['keyword'] = keywords.strip()
                item['digest'] = abstract.strip()
                item['content'] = article.strip()
                item['hot_degree'] = str(
                    tools.divide_hot_degree(self.name, hot_degree))
                item['scan_id'] = str(self.scan_id)
                return item
        except:
            try:
                # get the news title
                title = soup.select('h1[id="artibodyTitle"]')[0].get_text()
                # get the news publication time
                date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.localtime(
                        time.mktime(
                            time.strptime(
                                soup.select('span[id="pub_date"]')
                                [0].get_text().strip(), '%Y-%m-%d %H:%M:%S'))))
                # stop condition
                interval = tools.time_cmp(float(self.scan_id), date)
                if interval > self.days:
                    print('______________过时新闻________________'.encode(
                        "utf-8").decode(self.decoding))
                    return
                # get the comment count
                hot_degree = int(
                    soup.select('a[data-sudaclick="comment_sum_p"]')
                    [0].get_text())
                # get the news keywords
                keywords = []
                try:
                    a_list = soup.find_all('p',
                                           attrs={'class': 'art_keywords'
                                                  })[0].find_all('a')
                    for item in a_list:
                        keywords.append(item.get_text())
                except:
                    pass
                # get the news URL
                url = response.url
                # get the news content
                comView = soup.select('div[id="artibody"]')[0]
                # remove images and JS
                try:
                    comView.style.decompose()
                except:
                    pass
                try:
                    for i in comView.find_all('script'):
                        i.decompose()
                    for i in comView.find_all('div'):
                        i.decompose()
                        comView.find('p', attrs={
                            'class': 'article-editor'
                        }).decompose()
                except AttributeError:
                    pass
                # save the news content
                article = []
                for p in comView.find_all('p'):
                    if p.get_text() is not None:
                        article.append(p.get_text().strip())
                article = '\n'.join(article)
                # keywords and abstract
                temp_keywords, abstract = tools.sina_keyword_abstract(
                    article, 4, 5)
                if len(keywords) == 0:
                    keywords = temp_keywords
                keywords = ' '.join(keywords)

                print('新浪网: '.encode("utf-8").decode(self.decoding),
                      title.encode("utf-8").decode(self.decoding).strip())
                # wrap into an item
                similar_list = self.s.cal_similarities(article)
                if max(similar_list) > self.threshold:
                    item = NewsItem()
                    item['title'] = title.strip()
                    item['url'] = url.strip()
                    item['net_name'] = '新浪网'
                    item['ent_time'] = date
                    item['keyword'] = keywords.strip()
                    item['digest'] = abstract.strip()
                    item['content'] = article.strip()
                    item['hot_degree'] = str(
                        tools.divide_hot_degree(self.name, hot_degree))
                    item['scan_id'] = str(self.scan_id)
                    return item
            except:
                pass
Example 18
    def parse_detail(self, response):
        soup = BeautifulSoup(response.body)
        soup.prettify()
        # get the news title
        title = soup.select('h1[class="main-title"]')[0].get_text()
        # get the news publication time
        date = soup.select('span[class="date"]')[0].get_text()
        # get the news content
        article = soup.select('div[class="article"]')[0]
        # get the news keywords
        keywords = []
        try:
            a_list = soup.find_all('div', attrs={'class': 'keywords'})[0].find_all('a')
            for item in a_list:
                keywords.append(item.get_text())
        except:
            pass
        # get the news URL
        url = response.url
        # remove images and JS
        try:
            article.style.decompose()
        except:
            pass
        try:
            for i in article.find_all('script'):
                i.decompose()
            for i in article.find_all('div'):
                i.decompose()
            article.find('p', attrs={'class': 'article-editor'}).decompose()
        except AttributeError:
            article = article.get_text().strip()  # strip whitespace
        else:
            article = article.get_text().strip()  # strip whitespace
        temp_keywords, abstract = sina_keyword_abstract(article, 3, 3)
        if len(keywords) == 0:
            keywords = temp_keywords
        keywords = ' '.join(keywords)
        print('-----------------------------------------------')
        print('标题:', title)
        #print(article)
        print('关键词:', keywords)
        print('摘要:', end='\n')
        print(abstract)
        print('时间:', date)
        print('新闻URL:', url)
        print('相似度:', self.s.cal_similarities(article))
        print('-----------------------------------------------')

        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        if max(similar_list) > self.threshold:
            print('存在相似,保存入数据库')
            print(
                '-----------------------------------------------------------------------------------------------------------------')
            item = NewsItem()
            item['title'] = title.strip()
            item['url'] = url.strip()
            item['net_name'] = '新浪网'
            item['ent_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime(time.mktime(time.strptime(date, '%Y年%m月%d日 %H:%M'))))

            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = '0'
            item['scan_id'] = str(self.scan_id)
            return item
        else:
            print('没超过阈值,pass')
            print(
                '-----------------------------------------------------------------------------------------------------------------')
            pass
Example 19
    def parse_content(self, response):
        soup = BeautifulSoup(response.body)
        # get the time
        date = response.meta['date']
        # stop condition
        interval = time_cmp(float(self.scan_id), time.strftime("%Y-%m-%d %H:%M:%S",
                                                               time.localtime(time.mktime(
                                                                   time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00")))))
        if interval > self.days:
            print('过时')
            return
        # get the title
        title = soup.find('div', attrs={'class': 'mobile_article'}).find('h1').get_text()
        # get the article body
        article = soup.find('section', attrs={'class': 'textblock'})
        try:
            if article.find('p').get_text().strip().startswith('编者'):
                article.find('p').decompose()
        except:
            pass
        article = article.get_text().strip()
        # get the summary
        summary = soup.find('section', attrs={'class': 'summary'}).get_text().strip()
        # get keywords and abstract
        keywords, abstract = _36r_keyword_abstract(article, 3, 3)
        raw_keywords = []
        for item in soup.find_all('a', attrs={'class': 'kr-tag-gray'}):
            raw_keywords.append(item.get_text())
        if len(raw_keywords) != 0:
            keywords = raw_keywords
        keywords = ' '.join(keywords)
        print('-----------------------------------------------')
        print('标题:', title)
        print('总结:', summary)
        print('关键词:', keywords)
        #print(article)
        print('摘要:', end='')
        print(abstract)
        print('url:', response.url)
        print('时间:', time.strftime("%Y-%m-%d %H:%M:%S",
                                   time.localtime(time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00")))))
        print('相似度', self.s.cal_similarities(article))
        print('-----------------------------------------------')

        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        if max(similar_list) > self.threshold:
            print('存在相似,保存入数据库')
            print(
                '-----------------------------------------------------------------------------------------------------------------')
            item = NewsItem()
            item['title'] = title.strip()
            item['url'] = response.url.strip()
            item['net_name'] = '36氪'
            item['ent_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(
                time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S+08:00"))))
            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = '0'
            item['scan_id'] = str(self.scan_id)
            return item
        else:
            print('没超过阈值,pass')
            print(
                '-----------------------------------------------------------------------------------------------------------------')
            pass
Example 20
    def parse_content(self, response):
        soup = BeautifulSoup(response.body)
        # get the news publication time
        date = soup.select('td[class="time"]')[0].get_text().strip()
        date = time.strftime("%Y-%m-%d %H:%M:%S",
                             time.localtime(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M'))))
        # stop condition
        interval = time_cmp(float(self.scan_id), date)
        if interval > self.days:
            print('新闻过时')
            return

        # get the news title
        title = soup.select('h1[class="headTit"]')[0].get_text().strip()
        # get the news lead paragraph
        leadword = soup.select('div[class="article-lead"]')[0].get_text().strip()
        # get the news URL
        url = response.url
        # get the news content
        article = soup.select('div[class="lph-article-comView"]')[0]
        keywords = []
        try:
            for i in soup.find('div', attrs={'class': 'related-link clr'}).children:
                keywords.append(i.string.strip())
        except:
            pass
        # remove template markup and JS
        try:
            [s.extract() for s in article(['script', 'strong'])]
        except AttributeError:
            article = fix_content(article.get_text())  # strip whitespace
        else:
            article = fix_content(article.get_text())  # strip whitespace
        temp_keywords, abstract = leiphone_keyword_abstract(article, 3, 3)
        if len(keywords) == 0:
            keywords = temp_keywords
        keywords = ' '.join(keywords)
        print('-----------------------------------------------')
        print('标题:', title)
        print(leadword)  # lead paragraph
        #print(article)
        print('关键词:', keywords)
        print('摘要:', end='')
        print(abstract)
        print('时间:', date)
        print('新闻URL:', url)
        print('相似度:', self.s.cal_similarities(article))
        print('-----------------------------------------------')

        # wrap into an item
        similar_list = self.s.cal_similarities(article)
        if max(similar_list) > self.threshold:
            print('存在相似,保存入数据库')
            print(
                '-----------------------------------------------------------------------------------------------------------------')
            item = NewsItem()
            item['ent_time'] = date
            item['title'] = title.strip()
            item['url'] = url.strip()
            item['net_name'] = '雷锋网'
            item['keyword'] = keywords.strip()
            item['digest'] = abstract.strip()
            item['content'] = article.strip()
            item['hot_degree'] = '0'
            item['scan_id'] = self.scan_id
            return item
        else:
            print('没超过阈值,pass')
            print(
                '-----------------------------------------------------------------------------------------------------------------')
            pass
Example 21
    def parse_item(self, response):
        if response.status == 200:
            lis = response.xpath('//ul[@class="live-list"]/li')
            if lis is None or len(lis) == 0:
                item = ErrorItem()
                item['code'] = 801
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "每经网"
                # description of the error
                item['desc'] = '未找到html元素'
                # exception raised by the code
                item['exception'] = ''
                yield item
                return

            try:
                riqi = response.xpath(
                    '//p[@class="live"]/span/text()').getall()  # e.g. 2019年05月20日
                date = ''
                for temp in riqi:
                    if "年" in temp:
                        date = temp.replace("\n",
                                            "").replace("\n\r", "").replace(
                                                "\r\n",
                                                "").replace("\r", "").strip()
                        break

                for li in lis:
                    i = NewsItem()
                    i['source'] = "nbd"

                    timeStamp = ''
                    try:
                        temp = (li.xpath(
                            './div[@class="li-title"]/p/span/text()').get())
                        # e.g. 17:44:42

                        temp = temp.replace("\n",
                                            "").replace("\n\r", "").replace(
                                                "\r\n",
                                                "").replace("\r", "").strip()

                        temp = date + temp  # e.g. 2019年05月16日18:26:27

                        d = datetime.datetime.strptime(temp,
                                                       "%Y年%m月%d日%H:%M:%S")
                        t = d.timetuple()
                        timeStamp = time.mktime(t)
                    except Exception as e:
                        print(e)
                        i['pubDate'] = ""
                    else:
                        i['pubDate'] = timeStamp

                    i['title'] = ""
                    i['content'] = li.xpath(
                        './div[@class="li-text"]/a/text()').get()
                    # TODO
                    i['isRed'] = 0
                    yield i
            except Exception as e:
                item = ErrorItem()
                item['code'] = 802
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "每经网"
                # description of the error
                item['desc'] = '解析html标签错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
        else:
            item = ErrorItem()
            item['code'] = response.status
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "每经网"
            # description of the error
            item['desc'] = '响应错误'
            # exception raised by the code
            item['exception'] = ''
            yield item
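Example 21 turns the concatenated Chinese date string (e.g. 2019年05月20日 plus 17:44:42) into a Unix timestamp via datetime.strptime and time.mktime. The same conversion in isolation, as a small sketch:

    import datetime
    import time

    def to_timestamp(text: str) -> int:
        # e.g. '2019年05月16日18:26:27' -> seconds since the epoch (local time)
        d = datetime.datetime.strptime(text, '%Y年%m月%d日%H:%M:%S')
        return int(time.mktime(d.timetuple()))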
Example 22
    def parse_item(self, response):
        if response.status == 200:
            try:
                temp = re.findall('__NEXT_DATA__.*module',
                                  response.body.decode(), re.S)[0].replace(
                                      '__NEXT_DATA__ =',
                                      '').replace('__NEXT_DATA__',
                                                  '').replace('module',
                                                              '').strip()
                data_list = json.loads('[' + temp + ']')
                # print(data_list)
            except Exception as e:
                print(e)
                item = ErrorItem()
                item['code'] = 800
                # page where the error occurred
                item['url'] = response.url
                # time of the error
                # item['timestamp'] = time.time()
                # site where the error occurred
                item['site'] = "财联社"
                # description of the error
                item['desc'] = '响应的json数据错误'
                # exception raised by the code
                item['exception'] = str(e)
                yield item
            else:
                try:
                    for data in data_list[0]["props"]['initialState'][
                            'telegraph']['dataList']:
                        item = NewsItem()
                        item['source'] = 'cls'

                        item['pubDate'] = data['modified_time']
                        item['title'] = data['title']
                        # strip the leading 【…】 title marker from the content when present
                        if '【' in data['content'] and '】' in data['content']:
                            item['content'] = re.findall(
                                '】.*', data['content'])[0].replace("】", '')
                        else:
                            item['content'] = data['content']

                        # TODO
                        item['isRed'] = 0
                        yield item
                except Exception as e:
                    item = ErrorItem()
                    item['code'] = 902
                    # page where the error occurred
                    item['url'] = response.url
                    # time of the error
                    # item['timestamp'] = time.time()
                    # site where the error occurred
                    item['site'] = "财联社"
                    # description of the error
                    item['desc'] = '解析json数据错误'
                    # exception raised by the code
                    item['exception'] = str(e)
                    yield item
        else:
            item = ErrorItem()
            item['code'] = response.status
            # page where the error occurred
            item['url'] = response.url
            # time of the error
            # item['timestamp'] = time.time()
            # site where the error occurred
            item['site'] = "财联社"
            # description of the error
            item['desc'] = '响应错误'
            # exception raised by the code
            item['exception'] = ''
            yield item