def parse_news(self, response):
    """Extract article fields from ``response`` and yield one populated item.

    The response is handed to an ``Article`` (built with an empty URL and the
    module-level ``config``), which is then parsed. The parsed values are
    copied into a ``SimilarityItem``; any field that is missing or empty is
    stored as the string ``"N/A"``.

    Keys written to the item: ``title``, ``ltitle``, ``h1``, ``pubtime``,
    ``content``, ``lcontent``, ``url``, ``h1_title_siml``, ``_id``,
    ``weight``, ``judge`` and ``is_news``.

    Args:
        response: The fetched page. Its ``url`` attribute is read, and the
            object itself is passed to ``Article.download`` as ``input_html``.

    Yields:
        The single ``SimilarityItem`` built for this page.
    """

    def _similarity(left, right):
        # Both values must be non-empty to compare; otherwise the problem is
        # logged and the caller receives None.
        if not (left and right):
            logger.error("a:{0} b:{1}".format(left, right))
            return None
        return difflib.SequenceMatcher(None, left, right).quick_ratio()

    item = SimilarityItem()

    article = Article(url='', config=config)
    # NOTE(review): the response object itself (not its body text) is passed
    # as input_html -- presumably this Article variant accepts that; confirm.
    article.download(input_html=response)
    article.parse()

    # Read each parsed attribute once and reuse the local below.
    title = article.title
    headline = article.h1
    published = article.publish_date
    body = article.text

    # Title plus its length; both fall back to the "N/A" placeholder.
    item['title'] = title if title else "N/A"
    item['ltitle'] = len(title) if title else "N/A"

    item['h1'] = headline if headline else "N/A"

    # Only a real datetime instance is formatted; anything else is "N/A".
    item['pubtime'] = (
        published.strftime("%Y-%m-%d %H:%M:%S")
        if isinstance(published, datetime)
        else "N/A"
    )

    # Body text plus its length; both fall back to the "N/A" placeholder.
    item['content'] = body if body else "N/A"
    item['lcontent'] = len(body) if body else "N/A"

    page_url = response.url
    item['url'] = page_url

    # The similarity is computed on the raw parsed values, not on the "N/A"
    # placeholders, so a missing h1 or title results in None here.
    item['h1_title_siml'] = _similarity(headline, title)

    # The MD5 hex digest of the UTF-8 encoded URL is stored as the item's _id.
    item['_id'] = hashlib.md5(page_url.encode('utf-8')).hexdigest()
    item['weight'] = article.weight
    item['judge'] = 2
    item['is_news'] = article.get_is_news()

    yield item