Ejemplo n.º 1
0
def parse(self):
    """
    Parse the downloaded HTML and score how likely the page is news.

    Fills in title, h1, authors, publish date, meta fields, tags, movies,
    body text and article HTML on ``self``. Along the way ``self.weight``
    is accumulated from four contributions:

    * the second value returned by ``extractor.get_title`` (``siml``;
      presumably a title-similarity score -- confirm against the extractor),
    * the title length, scaled by ``ltitle_weight``,
    * the publish date (missing / exactly midnight / has a time of day),
      scaled by ``pubtime_weight``,
    * the length of the extracted text, scaled by ``lcontent_weight``.

    ``self.is_news`` is set to ``False`` when the final weight is
    ``<= 0.45`` and to ``True`` otherwise.

    Returns ``None``. If the parser produces no document the method
    returns early without setting ``is_parsed``.
    """
    # logger.debug("custom parse")
    self.throw_if_not_downloaded_verbose()

    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)

    if self.doc is None:
        # `parse` call failed, return nothing
        return

    # TODO: Fix this, sync in our fix_url() method
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5

    document_cleaner = DocumentCleaner(self.config)
    output_formatter = OutputFormatter(self.config)

    # Bind defaults before the try block: if get_title() (or unpacking its
    # result) raises ValueError, the handler below and the `if h1:` check
    # would otherwise fail with UnboundLocalError, and the later
    # `self.weight += ...` statements would have no baseline to add to.
    title = siml = h1 = None
    self.weight = 0

    try:
        title, siml, h1 = self.extractor.get_title(self.clean_doc)
        self.set_title(title)
        self.weight = siml
        ltitle = len(title)
        # Very short or very long titles earn the smallest bonus,
        # mid-length titles (12..21 characters) the largest.
        if ltitle >= 28 or ltitle <= 6:
            self.weight += ltitle_weight * 0.05
        elif ltitle <= 11 or ltitle >= 22:
            self.weight += ltitle_weight * 0.45
        else:
            self.weight += ltitle_weight * 0.6
    except ValueError:
        logger.error("title %s is_news %s h1 %s", title, siml, h1)

    if h1:
        self.h1 = h1[:self.config.MAX_TITLE]

    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)

    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.html, self.clean_doc)

    if self.publish_date is None:
        self.weight += pubtime_weight * 0.01
    elif (self.publish_date.hour == 0
          and self.publish_date.minute == 0
          and self.publish_date.second == 0):
        # A timestamp of exactly 00:00:00 scores lower than one that
        # carries a time of day.
        self.weight += pubtime_weight * 0.35
    else:
        self.weight += pubtime_weight * 0.5

    # Deciding whether a page is news from these fields alone is too
    # careless. Alternatively, maintain a manual whitelist and blacklist:
    # simple and high quality.
    # if self.is_news == False:
    #     self.is_parsed = True
    #     return

    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    self.set_meta_language(meta_lang)

    if self.config.use_meta_language:
        self.extractor.update_language(self.meta_lang)
        output_formatter.update_language(self.meta_lang)

    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)

    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)

    canonical_link = self.extractor.get_canonical_link(self.url,
                                                       self.clean_doc)
    self.set_canonical_link(canonical_link)

    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)

    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)

    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)

    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)
    # dump(self.doc)
    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        # The upstream author did not gate video extraction. Two options:
        # comment it out entirely, or add a switch -- the switch is used.
        if self.config.fetch_videos:
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())

        self.top_node = self.extractor.post_cleanup(self.top_node)
        self.clean_top_node = copy.deepcopy(self.top_node)

        text, article_html = output_formatter.get_formatted(self.top_node)
        self.set_article_html(article_html)
        self.set_text(text)

    # Score the body length: empty text is penalised heavily, the bonus
    # grows up to 2000 characters and drops again beyond that.
    ltext = len(self.text)
    if ltext == 0:
        self.weight += lcontent_weight * -10
    elif ltext <= 20:
        self.weight += lcontent_weight * 0.2
    elif ltext <= 50:
        self.weight += lcontent_weight * 0.3
    elif ltext <= 200:
        self.weight += lcontent_weight * 0.4
    elif ltext <= 800:
        self.weight += lcontent_weight * 0.5
    elif ltext <= 1400:
        self.weight += lcontent_weight * 0.55
    elif ltext <= 2000:
        self.weight += lcontent_weight * 0.6
    else:
        self.weight += lcontent_weight * 0.3

    logger.debug("url:{0}, weight:{1}".format(self.url, self.weight))
    if self.weight <= 0.45:
        self.is_news = False
    else:
        self.is_news = True

    if self.config.fetch_images:
        self.fetch_images()

    self.is_parsed = True
    self.release_resources()
    def parse(self, clean_doc=True):
        """
        Parse the downloaded article, choosing which DOM cleaner to apply.

        Extends the original newspaper3k Article parser.

        :param clean_doc:
            Selects the cleaner run on the DOM before body extraction.
            Truthy (default): the original ``DocumentCleaner``.
            Falsy: the extended ``Cleaner``, which does not remove certain
            DOM elements.

            On some sources the original cleaner prevents the text from
            being parsed, so it can be switched off for those special
            cases. It should almost always stay enabled, otherwise bad
            elements might slip through.
        :return: None
        """
        self.throw_if_not_downloaded_verbose()

        self.doc = self.config.get_parser().fromstring(self.html)
        self.clean_doc = copy.deepcopy(self.doc)

        # The parser produced no document: nothing to extract.
        if self.doc is None:
            return

        # TODO: Fix this, sync in our fix_url() method
        self.link_hash = self.get_parse_candidate().link_hash  # MD5

        formatter = OutputFormatter(self.config)

        # --- metadata taken from the untouched copy of the DOM ---
        self.set_title(self.extractor.get_title(self.clean_doc))
        self.set_authors(self.extractor.get_authors(self.clean_doc))
        self.set_meta_language(self.extractor.get_meta_lang(self.clean_doc))

        if self.config.use_meta_language:
            for consumer in (self.extractor, formatter):
                consumer.update_language(self.meta_lang)

        self.set_meta_favicon(self.extractor.get_favicon(self.clean_doc))
        self.set_meta_description(
            self.extractor.get_meta_description(self.clean_doc))
        self.set_canonical_link(
            self.extractor.get_canonical_link(self.url, self.clean_doc))
        self.set_tags(self.extractor.extract_tags(self.clean_doc))
        self.set_meta_keywords(
            self.extractor.get_meta_keywords(self.clean_doc))
        self.set_meta_data(self.extractor.get_meta_data(self.clean_doc))

        self.publish_date = self.extractor.get_publishing_date(
            self.url, self.clean_doc)

        # --- body extraction on the cleaned DOM ---
        # Before any computations on the body, clean the DOM object with
        # either the original cleaner or the extended one.
        cleaner_cls = DocumentCleaner if clean_doc else Cleaner
        self.doc = cleaner_cls(self.config).clean(self.doc)

        self.top_node = self.extractor.calculate_best_node(self.doc)
        if self.top_node is not None:
            videos = VideoExtractor(self.config, self.top_node).get_videos()
            self.set_movies(videos)

            self.top_node = self.extractor.post_cleanup(self.top_node)
            self.clean_top_node = copy.deepcopy(self.top_node)

            body_text, body_html = formatter.get_formatted(self.top_node)
            self.set_article_html(body_html)
            self.set_text(body_text)

        self.fetch_images()

        self.is_parsed = True
        self.release_resources()