def _process_article(self, source, rsp): try: if rsp is None: return if not rsp: info = 'download error: %s' % rsp.request.url logging.error(info) raise Exception(info) if not rsp.content: info = 'rsp no content: %s' % rsp.request.url logging.error(info) raise Exception(info) data = parse_data(rsp.content) if not data.get('content'): return data['origin_url'] = rsp.url article = Article(source=source, category=source, rss_url=rsp.request.url) gen_article_data(source, article, data) try: article.save() except (ArticleExsit, NotUniqueError): info = 'duplicated: %s' % article.source_url logging.info(info) return if article.top_images: article.top_images = download_images([image['url'] for image in article.top_images], article.seq_id) if article.related_images: article.related_images = download_images([image['url'] for image in article.related_images], article.seq_id) article.attach_feature(ArticleParser(article.text).gen_content_feature()) article.set_usable() article.save() article.warm(warm_conn) except: raven_client.captureException()
def parse(self, response):
    """Yield a single item describing the given response.

    The item is a dict with two keys: ``'url'`` holds ``response.url`` and
    ``'data'`` holds the result of calling ``parse_data`` with that URL and
    the response body decoded via ``response.body_as_unicode()``.
    """
    yield {
        'url': response.url,
        'data': parse_data(response.url, response.body_as_unicode()),
    }