Example #1
0
 def _process_article(self, source, rsp):
     try:
         if rsp is None:
             return
         if not rsp:
             info = 'download error: %s' % rsp.request.url
             logging.error(info)
             raise Exception(info)
         if not rsp.content:
             info = 'rsp no content: %s' % rsp.request.url
             logging.error(info)
             raise Exception(info)
         data = parse_data(rsp.content)
         if not data.get('content'):
             return
         data['origin_url'] = rsp.url
         article = Article(source=source, category=source, rss_url=rsp.request.url)
         gen_article_data(source, article, data)
         try:
             article.save()
         except (ArticleExsit, NotUniqueError):
             info = 'duplicated: %s' % article.source_url
             logging.info(info)
             return
         if article.top_images:
             article.top_images = download_images([image['url'] for image in article.top_images], article.seq_id)
         if article.related_images:
             article.related_images = download_images([image['url'] for image in article.related_images], article.seq_id)
         article.attach_feature(ArticleParser(article.text).gen_content_feature())
         article.set_usable()
         article.save()
         article.warm(warm_conn)
     except:
         raven_client.captureException()
Example #2
0
 def parse(self, response):
     """Yield one item describing the given response.

     The item is a dict with two keys: ``'url'`` holds ``response.url`` and
     ``'data'`` holds the result of calling ``parse_data`` with the URL and
     the response body as unicode text.
     """
     yield {
         'url': response.url,
         'data': parse_data(response.url, response.body_as_unicode()),
     }