Exemple #1
0
    def parse_news(self, response):
        """Extract article fields from ``response`` and yield one item.

        The response is run through ``Article`` and the results are copied
        into a ``SimilarityItem``:

        * ``title`` / ``h1`` / ``content`` -- the extracted strings, or
          ``"N/A"`` when the extractor returned nothing.
        * ``ltitle`` / ``lcontent`` -- the length of the title / text, or
          ``"N/A"`` when the corresponding string is missing.
        * ``pubtime`` -- the publish date formatted as
          ``YYYY-MM-DD HH:MM:SS``, or ``"N/A"`` when it is not a ``datetime``.
        * ``url`` -- ``response.url``.
        * ``h1_title_siml`` -- ``difflib`` quick ratio between h1 and title;
          ``None`` (plus an error log) when either of them is empty.
        * ``_id`` -- MD5 hex digest of the UTF-8 encoded URL.
        * ``weight`` / ``is_news`` -- taken straight from the article object.
        * ``judge`` -- always ``2``.
        """
        item = SimilarityItem()

        # The URL is left empty and the response is handed over directly via
        # ``input_html`` -- presumably so the extractor does not fetch the
        # page a second time (confirm against the Article implementation).
        article = Article(url='', config=config)
        article.download(input_html=response)
        article.parse()

        title = article.title
        h1 = article.h1
        published = article.publish_date
        text = article.text
        url = response.url

        # Missing values are stored as the "N/A" placeholder, including the
        # length fields, so every key is always present on the item.
        item['title'] = title or "N/A"
        item['ltitle'] = len(title) if title else "N/A"
        item['h1'] = h1 or "N/A"
        item['pubtime'] = (
            published.strftime("%Y-%m-%d %H:%M:%S")
            if isinstance(published, datetime)
            else "N/A"
        )
        item['content'] = text or "N/A"
        item['lcontent'] = len(text) if text else "N/A"
        item['url'] = url

        # The similarity is only meaningful when both headings exist;
        # otherwise the problem is logged and the field is left as None.
        if h1 and title:
            similarity = difflib.SequenceMatcher(None, h1, title).quick_ratio()
        else:
            logger.error("a:{0} b:{1}".format(h1, title))
            similarity = None
        item['h1_title_siml'] = similarity

        item['_id'] = hashlib.md5(url.encode('utf-8')).hexdigest()
        item['weight'] = article.weight
        item['judge'] = 2
        item['is_news'] = article.get_is_news()
        yield item