Example #1
0
def newspaper_fulltext2(parser, language, url):
    '''
    Extract the main article text and HTML from an already parsed document.

    Faster variant that drives newspaper3k's internal cleaner / extractor /
    formatter classes directly, so the lxml parse tree does not have to be
    rebuilt from the raw page.
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71

    parser   -- the existing lxml document to extract from
    language -- language code assigned to the newspaper configuration
    url      -- accepted for interface compatibility; not used by this function

    Returns a dict ``{'value': {'text': ..., 'html': ...}, 'pattern': 'newspaper3k'}``.
    Both values are empty strings when no content node can be found.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language
    # presumably makes the formatter return the article HTML as well -- confirm
    settings.keep_article_html = True

    content_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    formatter = OutputFormatter(settings)

    best_node = content_extractor.calculate_best_node(cleaner.clean(parser))
    if best_node is None:
        # nothing looked like article content
        text, html = '', ''
    else:
        best_node = content_extractor.post_cleanup(best_node)
        text, html = formatter.get_formatted(best_node)

    extracted = {'text': text, 'html': html}
    return {'value': extracted, 'pattern': 'newspaper3k'}
Example #2
0
def newspaper_fulltext(parser, language):
    '''
    Run newspaper3k's article extraction on an already parsed lxml document.

    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.

    parser   -- the existing lxml document to extract from
    language -- language code assigned to the newspaper configuration

    Returns a dict ``{'value': {'text': ..., 'html': ...}, 'pattern': 'newspaper3k'}``.
    Both values are ``None`` when no content node can be found.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language

    content_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    formatter = OutputFormatter(settings)

    # Default to "nothing found"; overwritten below when a content node exists.
    text = html = None

    cleaned = cleaner.clean(parser)
    best_node = content_extractor.calculate_best_node(cleaned)
    if best_node is not None:
        best_node = content_extractor.post_cleanup(best_node)
        text, html = formatter.get_formatted(best_node)

    extracted = {'text': text, 'html': html}
    return {'value': extracted, 'pattern': 'newspaper3k'}
def get_data_from_html(html):
    """Extract title, publication timestamp and body text from raw HTML.

    :param html: HTML source of the page.
    :return: dict with the keys ``'title'``, ``'published_at'`` (ISO-8601
        string) and ``'content'`` (formatted article text; ``''`` when no
        content node could be located).
    """
    result = {}
    parsed_html = Parser.fromstring(html)

    config = Configuration()
    extractor = ContentExtractor(config)
    formatter = OutputFormatter(config)
    cleaner = DocumentCleaner(config)

    result['title'] = extractor.get_title(parsed_html)

    # No URL is available here, so only the document itself is inspected.
    publishing_date = extractor.get_publishing_date('', parsed_html)
    if publishing_date is None:
        # Fall back to the current (naive, local) time when the page carries
        # no recognisable publication date.
        publishing_date = datetime.datetime.now()

    result['published_at'] = publishing_date.isoformat()

    cleaned_html = cleaner.clean(parsed_html)
    top_node = extractor.calculate_best_node(cleaned_html)
    if top_node is None:
        # calculate_best_node yields None for pages without article-like
        # content; post_cleanup cannot handle that, so report empty content
        # instead of crashing.
        result['content'] = ''
    else:
        top_node = extractor.post_cleanup(top_node)
        result['content'], _ = formatter.get_formatted(top_node)

    return result
Example #4
0
class ArticleExtractionPipeline(object):
    """Item pipeline that fills in article metadata and body text.

    Uses newspaper's parser, extractor, cleaner and formatter on the raw HTML
    stored in ``item["content"]``. Items without a usable recent date or
    with too little text are rejected by raising ``DropItem``.
    (The ``process_item`` / ``DropItem`` shape presumably targets Scrapy --
    confirm against the project settings.)
    """

    # Labels that may precede the actual timestamp inside a <time> element.
    _TIME_INDICATORS = ("Edited", "Updated", "Published")

    # Meta tags consulted (in order) when the extractor finds no publishing date.
    _DATE_META_SELECTORS = (
        "meta[name='LastModifiedDate']",  # aljazeera, Sun, 07 January 2018 18:36:49 GMT
        "meta[name='Last-Modified']",  # times of india, Jan 9, 2018, 05:18 IST
        "meta[property='og:updated_time']",  # diplomat, "2018-01-05 23:22:46"
    )

    def __init__(self):
        # sets meta config for article and parser
        self.config = Configuration()
        self.parser = self.config.get_parser()
        # extracts info (author, tags, text, etc.) from parsed article
        self.extractor = ContentExtractor(self.config)
        # cleans unwanted tags and nodes from DOM
        self.doc_cleaner = DocumentCleaner(self.config)
        # outputs formatted text from parsed xpath nodes
        self.formatter = OutputFormatter(self.config)

    # right now basically only works for RT
    def find_date_from_html(self, doc):
        """Return the earliest date found in the document's ``<time>`` tags.

        :param doc: parsed html from ``self.parser``.
        :return: naive ``datetime`` of the earliest parseable ``<time>``
            element, or ``None`` when none can be parsed.
        """
        # Local import so this block does not depend on a module-level import.
        import re

        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        candidates = self.parser.getElementsByTag(doc, tag="time")  # add more
        times = []
        for candidate in candidates:
            time_string = candidate.text
            if not time_string:
                # <time> elements without text (None or "") carry nothing to
                # parse; testing `in None` below would raise TypeError.
                continue
            for indicator in self._TIME_INDICATORS:
                if indicator in time_string:
                    # indicator probably followed by "at" or ":", actual time
                    # is after that. Match "at" as a whole word only: a plain
                    # substring test also hits the "at" inside "Updated".
                    at_match = re.search(r"\bat\b", time_string)
                    if at_match:
                        time_string = time_string[at_match.end():]
                    elif ":" in time_string:
                        time_string = time_string.split(":", 1)[1]
                    break
            parsed = self.datetime_from_str(time_string)
            if parsed:
                times.append(parsed)
        return min(times) if times else None

    def datetime_from_str(self, datetime_string):
        """Convert a date string (or datetime) into a naive ``datetime``.

        :param datetime_string: text to parse; an existing ``datetime`` is
            accepted as well and only has its timezone stripped.
        :return: naive ``datetime``, or ``None`` when parsing fails.
        """
        if isinstance(datetime_string, datetime.datetime):
            # Already parsed upstream; the string parser would reject it with
            # TypeError and the date would be lost.
            return datetime_string.replace(tzinfo=None)
        try:
            # tzinfo is dropped because naive and offset-aware datetimes
            # cannot be compared with each other.
            return date_parser.parse(datetime_string).replace(tzinfo=None)
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # TODO: generalize
    def get_date(self, url, doc):
        """Determine the article date from metadata, falling back to the HTML.

        Sources are tried in order: the extractor's publishing date, the meta
        tags in ``_DATE_META_SELECTORS``, then the ``<time>`` elements of the
        document. The first source that yields a parseable value wins.

        :param url: article URL, passed to the extractor's date lookup.
        :param doc: parsed html from ``self.parser``.
        :return: naive ``datetime`` or ``None``.
        """
        # telesur, africanews
        raw_date = self.extractor.get_publishing_date(url, doc)
        if raw_date:
            parsed = self.datetime_from_str(raw_date)
            if parsed:
                return parsed

        for selector in self._DATE_META_SELECTORS:
            raw_date = self.extractor.get_meta_content(doc, selector)
            if raw_date:
                parsed = self.datetime_from_str(raw_date)
                if parsed:
                    return parsed

        return self.find_date_from_html(doc)

    def recent_article(self, date, max_days_elapsed=3):
        """Return True when ``date`` is less than ``max_days_elapsed`` days old.

        :param date: naive ``datetime`` object, compared against
            ``datetime.datetime.now()``.
        :param max_days_elapsed: maximum accepted age in days.
        """
        return datetime.datetime.now() - date < datetime.timedelta(
            days=max_days_elapsed)

    def process_item(self, item, spider):
        """Populate ``item`` with title, description, keywords, date and text.

        ``item["content"]`` is read as raw HTML and replaced by the extracted
        article text.

        :param item: mapping with at least ``"content"`` and ``"url"``.
        :param spider: unused here.
        :return: the updated ``item``.
        :raises DropItem: when the date is missing, unparseable or older than
            7 days, or when fewer than 600 characters of text are extracted.
        """
        doc = self.parser.fromstring(item["content"])

        item["title"] = self.extractor.get_title(doc)
        item["description"] = self.extractor.get_meta_description(doc)
        item["keywords"] = (
            self.extractor.get_meta_content(doc, "meta[name='news_keywords']")
            or self.extractor.get_meta_keywords(doc))
        item["date"] = self.get_date(item["url"], doc)

        # drop item if no date
        if not item["date"] or not self.recent_article(
                item["date"], max_days_elapsed=7):
            raise DropItem("Missing or invalid date for: {}".format(
                item["title"]))

        # clean:
        clean_doc = self.doc_cleaner.clean(doc)
        best_node = self.extractor.calculate_best_node(clean_doc)
        if best_node is None:
            # No content node at all: treat like an article that is too short
            # rather than letting post_cleanup fail on None.
            raise DropItem("Not enough text: {}".format(item["title"]))
        top_node = self.extractor.post_cleanup(best_node)
        # get_formatted returns (text, html); only the text is kept
        item["content"] = self.formatter.get_formatted(top_node)[0]

        # drop item if article too short
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))

        logging.info("ARTICLE TITLE: {}".format(item["title"]))
        logging.info("\t time: {}".format(item["date"]))
        return item
Example #5
0
def parse(self):
    """
    Parse the downloaded HTML and score how likely the page is a news article.

    Follows the stock article parse flow (title, authors, meta data, body
    extraction) but uses a customised ``get_publishing_date`` call and
    accumulates ``self.weight`` from three signals: title similarity/length,
    precision of the publish date, and length of the extracted text.
    ``self.is_news`` is set to True when the final weight exceeds 0.45.

    Returns None. Returns early, without marking the article as parsed, when
    the HTML cannot be parsed into a document.
    """
    # logger.debug("custom parse")
    self.throw_if_not_downloaded_verbose()

    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)

    if self.doc is None:
        # `parse` call failed, return nothing
        return

    # TODO: Fix this, sync in our fix_url() method
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5

    document_cleaner = DocumentCleaner(self.config)
    output_formatter = OutputFormatter(self.config)

    # Defaults so that a failing title extraction can still be logged and the
    # weight can still be accumulated further down.
    title = siml = h1 = None
    self.weight = 0
    try:
        title, siml, h1 = self.extractor.get_title(self.clean_doc)
        self.set_title(title)
        self.weight = siml
        ltitle = len(title)
        # Very short or very long titles contribute least.
        if ltitle >= 28 or ltitle <= 6:
            self.weight += ltitle_weight * 0.05
        elif ltitle <= 11 or ltitle >= 22:
            self.weight += ltitle_weight * 0.45
        else:
            self.weight += ltitle_weight * 0.6
    except ValueError:
        logger.error("title %s is_news %s h1 %s", title, siml, h1)

    if h1:
        self.h1 = h1[:self.config.MAX_TITLE]

    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)

    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.html, self.clean_doc)

    if self.publish_date is None:
        self.weight += pubtime_weight * 0.01
    elif (self.publish_date.hour == 0 and self.publish_date.minute == 0
          and self.publish_date.second == 0):
        # Midnight exactly: a date without a time-of-day component scores lower.
        self.weight += pubtime_weight * 0.35
    else:
        self.weight += pubtime_weight * 0.5

    # Deciding whether this is news from these fields alone is too unreliable.
    # Alternatively, maintain a whitelist and a blacklist by hand:
    # simple and high quality.
    # if self.is_news == False:
    #     self.is_parsed = True
    #     return

    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    self.set_meta_language(meta_lang)

    if self.config.use_meta_language:
        self.extractor.update_language(self.meta_lang)
        output_formatter.update_language(self.meta_lang)

    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)

    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)

    canonical_link = self.extractor.get_canonical_link(self.url,
                                                       self.clean_doc)
    self.set_canonical_link(canonical_link)

    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)

    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)

    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)

    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)
    # dump(self.doc)
    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        # The upstream author does not gate video extraction here. There are
        # two options: comment it out entirely, or add a switch (done here).
        if self.config.fetch_videos:
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())

        self.top_node = self.extractor.post_cleanup(self.top_node)
        self.clean_top_node = copy.deepcopy(self.top_node)

        text, article_html = output_formatter.get_formatted(self.top_node)
        self.set_article_html(article_html)
        self.set_text(text)

    # Text-length contribution: empty text is heavily penalised, and texts
    # above 2000 characters score lower than those of 201-2000 characters.
    ltext = len(self.text)
    if ltext == 0:
        self.weight += lcontent_weight * -10
    elif ltext <= 20:
        self.weight += lcontent_weight * 0.2
    elif ltext <= 50:
        self.weight += lcontent_weight * 0.3
    elif ltext <= 200:
        self.weight += lcontent_weight * 0.4
    elif ltext <= 800:
        self.weight += lcontent_weight * 0.5
    elif ltext <= 1400:
        self.weight += lcontent_weight * 0.55
    elif ltext <= 2000:
        self.weight += lcontent_weight * 0.6
    else:
        self.weight += lcontent_weight * 0.3

    logger.debug("url:{0}, weight:{1}".format(self.url, self.weight))
    if self.weight <= 0.45:
        self.is_news = False
    else:
        self.is_news = True

    if self.config.fetch_images:
        self.fetch_images()

    self.is_parsed = True
    self.release_resources()
Example #6
0
def modified_fulltext(parser, language):
    '''
    Extract article text using a modified best-node search.

    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser. Candidate nodes are
    collected with a single compiled XPath over <pre>, <p> and <td> elements,
    and the DocumentCleaner pass is skipped (the document is scored as given).

    parser   -- the existing lxml document to extract from
    language -- language code assigned to the newspaper configuration

    Returns a dict ``{'value': {'text': ..., 'html': ...}, 'pattern': 'modified'}``.
    Both values are ``None`` when no content node can be found.
    '''
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        # `self` is a ContentExtractor instance; this is called as a plain
        # function, not bound as a method.
        top_node = None
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')
        starting_boost = float(1.0)
        parent_nodes = []
        # (node, stopword count) pairs; the count is kept so the scoring loop
        # does not have to re-tokenise the same text a second time.
        nodes_with_text = []

        for node in cxpath_body_nodes(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            stopword_count = word_stats.get_stopword_count()
            if stopword_count > 2 and not high_link_density:
                nodes_with_text.append((node, stopword_count))

        nodes_number = len(nodes_with_text)
        # NOTE: never updated below, so it contributes 0 to `negscore`.
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        for i, (node, stopword_count) in enumerate(nodes_with_text):
            boost_score = float(0)
            # boost: each successive boostable node gets a smaller bonus
            if self.is_boostable(node):
                boost_score = float((1.0 / starting_boost) * 50)
                starting_boost += 1
            # nodes_number: on long documents, penalise the last quarter of
            # the candidate nodes
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            upscore = int(stopword_count + boost_score)

            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # Parent of parent node receives half the score
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)

        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            # Guarantees a result even if no parent scores above zero.
            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    top_node = calculate_best_node(extractor, doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None

    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
    def parse(self, clean_doc=True):
        """
        Extended version of the original newspaper3k Article parser.

        :param clean_doc:
            Selects which cleaner is applied to the DOM before the body is
            extracted. When true, the original ``DocumentCleaner`` is used;
            otherwise the extended ``Cleaner``, which does not remove certain
            DOM elements.

            On some sources the original cleaner prevents the text from being
            parsed (a special case). It should nevertheless be used almost
            always, otherwise bad elements might slip through.
        :return: None
        """
        self.throw_if_not_downloaded_verbose()

        self.doc = self.config.get_parser().fromstring(self.html)
        self.clean_doc = copy.deepcopy(self.doc)
        if self.doc is None:
            # `parse` call failed, return nothing
            return

        # TODO: Fix this, sync in our fix_url() method
        self.link_hash = self.get_parse_candidate().link_hash  # MD5

        formatter = OutputFormatter(self.config)

        # Metadata is read from the untouched copy of the document.
        self.set_title(self.extractor.get_title(self.clean_doc))
        self.set_authors(self.extractor.get_authors(self.clean_doc))
        self.set_meta_language(self.extractor.get_meta_lang(self.clean_doc))

        if self.config.use_meta_language:
            self.extractor.update_language(self.meta_lang)
            formatter.update_language(self.meta_lang)

        self.set_meta_favicon(self.extractor.get_favicon(self.clean_doc))
        self.set_meta_description(
            self.extractor.get_meta_description(self.clean_doc))
        self.set_canonical_link(
            self.extractor.get_canonical_link(self.url, self.clean_doc))
        self.set_tags(self.extractor.extract_tags(self.clean_doc))
        self.set_meta_keywords(
            self.extractor.get_meta_keywords(self.clean_doc))
        self.set_meta_data(self.extractor.get_meta_data(self.clean_doc))

        self.publish_date = self.extractor.get_publishing_date(
            self.url, self.clean_doc)

        # Before any computations on the body, clean the DOM object with the
        # cleaner selected by the caller.
        cleaner_cls = DocumentCleaner if clean_doc else Cleaner
        self.doc = cleaner_cls(self.config).clean(self.doc)

        self.top_node = self.extractor.calculate_best_node(self.doc)
        if self.top_node is not None:
            videos = VideoExtractor(self.config, self.top_node).get_videos()
            self.set_movies(videos)

            self.top_node = self.extractor.post_cleanup(self.top_node)
            self.clean_top_node = copy.deepcopy(self.top_node)

            body_text, body_html = formatter.get_formatted(self.top_node)
            self.set_article_html(body_html)
            self.set_text(body_text)

        self.fetch_images()

        self.is_parsed = True
        self.release_resources()