Example #1
0
    def crawl(self, crawl_candidate):
        """Fetch ``crawl_candidate`` and build a populated ``Article`` from it.

        Returns the ``Article``. If no HTML could be obtained, the article is
        returned exactly as created, with no field set. Otherwise its URL and
        link hash, raw HTML, parsed documents, metadata, tags, top node, top
        image (only when ``config.enable_image_fetching`` is set) and cleaned
        text are filled in, plus ``additional_data['video_info']`` when video
        info is found.

        Exceptions raised by the extraction helpers propagate to the caller;
        ``relase_resources`` is still invoked first so the temporary files are
        cleaned up on failure as well as on success.
        """
        article = Article()

        parse_candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            # nothing was fetched: hand back the untouched article
            return article

        doc = self.get_document(raw_html)

        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        # keep a copy taken before the cleaner runs; it is what the image
        # extractor is given further down
        article.raw_doc = deepcopy(doc)

        try:
            article.title = extractor.get_title(article)
            # TODO
            # article.publish_date = config.publishDateExtractor.extract(doc)
            # article.additional_data = config.get_additionaldata_extractor.extract(doc)
            article.meta_lang = extractor.get_meta_lang(article)
            article.meta_favicon = extractor.get_favicon(article)
            article.meta_description = extractor.get_meta_description(article)
            article.meta_keywords = extractor.get_meta_keywords(article)
            article.canonical_link = extractor.get_canonical_link(article)
            article.domain = extractor.get_domain(article.final_url)
            article.tags = extractor.extract_tags(article)
            # before we do any calcs on the body itself let's clean up the document
            article.doc = document_cleaner.clean(article)

            # big stuff
            article.top_node = extractor.calculate_best_node(article)
            if article.top_node is not None:
                # video handling
                video_extractor = self.get_video_extractor(article)
                video_extractor.get_videos()
                # image handling
                if self.config.enable_image_fetching:
                    image_extractor = self.get_image_extractor(article)
                    article.top_image = image_extractor.get_best_image(
                        article.raw_doc, article.top_node)
                # post cleanup
                article.top_node = extractor.post_cleanup(article.top_node)
                # clean_text
                article.cleaned_text = output_formatter.get_formatted_text(article)
        finally:
            # cleanup tmp file -- in a ``finally`` so a failing extractor does
            # not leave temporary files behind (method name spelled as it is
            # defined outside this block)
            self.relase_resources(article)

        # extract video info
        video_info = self.get_video_info_extractor(article).get_video_info()
        if video_info:
            article.additional_data['video_info'] = video_info

        return article
Example #2
0
    def crawl(self, crawl_candidate):
        """Crawl one candidate and return the extracted ``Article``.

        When ``get_html`` yields ``None`` the freshly created article is
        returned with no field set. Otherwise the article receives its URL and
        link hash, raw HTML, parsed and cleaned documents, metadata, tags, top
        node, top image (only if ``config.enable_image_fetching`` is set) and
        cleaned text. ``additional_data['video_info']`` is added when the
        video info extractor returns something truthy.

        Errors raised while extracting are not swallowed, but the temporary
        files are released via ``relase_resources`` before they propagate.
        """
        article = Article()

        parse_candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            # no content could be retrieved for this candidate
            return article

        doc = self.get_document(raw_html)

        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        # copy made before cleaning; passed to the image extractor below
        article.raw_doc = deepcopy(doc)

        try:
            article.title = extractor.get_title(article)
            # TODO
            # article.publish_date = config.publishDateExtractor.extract(doc)
            # article.additional_data = config.get_additionaldata_extractor.extract(doc)
            article.meta_lang = extractor.get_meta_lang(article)
            article.meta_favicon = extractor.get_favicon(article)
            article.meta_description = extractor.get_meta_description(article)
            article.meta_keywords = extractor.get_meta_keywords(article)
            article.canonical_link = extractor.get_canonical_link(article)
            article.domain = extractor.get_domain(article.final_url)
            article.tags = extractor.extract_tags(article)
            # before we do any calcs on the body itself let's clean up the document
            article.doc = document_cleaner.clean(article)

            # big stuff
            article.top_node = extractor.calculate_best_node(article)
            if article.top_node is not None:
                # video handling
                video_extractor = self.get_video_extractor(article)
                video_extractor.get_videos()
                # image handling
                if self.config.enable_image_fetching:
                    image_extractor = self.get_image_extractor(article)
                    article.top_image = image_extractor.get_best_image(article.raw_doc, article.top_node)
                # post cleanup
                article.top_node = extractor.post_cleanup(article.top_node)
                # clean_text
                article.cleaned_text = output_formatter.get_formatted_text(article)
        finally:
            # cleanup tmp file -- guaranteed even if an extractor raised
            # (method name spelled as it is defined outside this block)
            self.relase_resources(article)

        # extract video info
        video_info = self.get_video_info_extractor(article).get_video_info()
        if video_info:
            article.additional_data['video_info'] = video_info

        return article
Example #3
0
    def _goose_cleaned_text(cls, html, page_html):
        """Run the cleaner, extractor and formatter and return the cleaned text.

        ``html`` is stored as the article's ``raw_html``; ``page_html`` is
        used as both its ``raw_doc`` and its initial ``doc``. The return value
        is the article's ``cleaned_text``, which is only computed when a top
        node is found. No image extraction is performed.
        """
        art = Article()
        art.raw_html = html
        art.raw_doc = page_html
        art.doc = art.raw_doc

        # every helper is built with its own Configuration instance
        extractor = ContentExtractor(Configuration(), art)
        cleaner = DocumentCleaner(Configuration(), art)
        formatter = OutputFormatter(Configuration(), art)

        # clean first, then look for the best node in the cleaned document
        art.doc = cleaner.clean()
        art.top_node = extractor.calculate_best_node()

        if art.top_node is None:
            # no candidate node: cleaned_text is returned as Article() left it
            return art.cleaned_text

        art.top_node = extractor.post_cleanup()
        art.cleaned_text = formatter.get_formatted_text()
        return art.cleaned_text
Example #4
0
    def _goose_cleaned_text(cls, html, page_html):
        """Run the cleaner, extractor and formatter and return the cleaned text.

        ``html`` is stored as the article's ``raw_html``; ``page_html`` is
        used as both its ``raw_doc`` and its initial ``doc``.

        Returns the article's ``cleaned_text``. It is only computed when a top
        node is found; if a ``UnicodeDecodeError`` is raised during
        processing, the top node is reset to ``None`` and ``cleaned_text`` is
        returned as it stood at that point.
        """
        article = Article()
        article.raw_html = html
        article.raw_doc = page_html
        article.doc = article.raw_doc

        goose_extractor = ContentExtractor(Configuration(), article)
        goose_cleaner = DocumentCleaner(Configuration(), article)
        goose_formatter = OutputFormatter(Configuration(), article)
        # goose_image_extractor = ImageExtractor(Configuration(), article) use

        try:
            article.doc = goose_cleaner.clean()
            article.top_node = goose_extractor.calculate_best_node()
            if article.top_node is not None:
                article.top_node = goose_extractor.post_cleanup()
                article.cleaned_text = goose_formatter.get_formatted_text()
        except UnicodeDecodeError:
            # ``except X:`` is valid on both Python 2 and 3, unlike the old
            # ``except X, e`` form; the exception object was never used.
            # Undecodable content is treated as "no top node found".
            article.top_node = None

        # without this the computed text was discarded and callers got None
        return article.cleaned_text
Example #5
0
    def crawl(self, crawl_candidate):
        """Fetch ``crawl_candidate``, decode its HTML and extract an ``Article``.

        Returns the ``Article``. If ``get_html`` yields ``None`` the article
        is returned as created. Otherwise the raw bytes are decoded with the
        charset reported by ``get_charset`` (undecodable bytes are ignored),
        pre-processed, parsed and run through the extraction pipeline.

        Side effect: when the page is declared in a GB* charset or contains
        characters in U+4E00..U+9FA5, ``self.config.stopwords_class`` is set
        to ``StopWordsChinese``. The assignment is made on the config object
        itself and is not undone here.

        Exceptions from the extraction helpers propagate, but
        ``relase_resources`` is always called once the article has been
        initialised so temporary files are cleaned up.
        """
        article = Article()

        parse_candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return article
        charset = get_charset(raw_html)
        raw_html = raw_html.decode(charset, 'ignore')
        pattern = re.compile("[\u4e00-\u9fa5]")
        # Exact membership test: the former substring search against
        # 'GB2312 GBK GB18030' also accepted fragments such as 'GB' or '2312'.
        if charset.upper() in ('GB2312', 'GBK', 'GB18030') \
                or pattern.search(raw_html) is not None:
            self.config.stopwords_class = StopWordsChinese
            # NOTE(review): debug output on stdout ("Chinese"); kept as-is,
            # consider moving to logging.
            print("中文")
        # presumably strips these elements from the markup -- TODO confirm
        # against clean_tags
        raw_html = clean_tags(raw_html, ['SOHUADCODE', 'script', 'style'])
        if charset != 'utf-8':
            # NOTE(review): comparison is case-sensitive, so e.g. 'UTF-8'
            # also takes this branch -- confirm what get_charset returns
            raw_html = replace_meta_charset(raw_html)
        raw_html = force_meta(raw_html)
        doc = self.get_document(parse_candidate.url, raw_html)
        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        # copy taken before the cleaner runs; used for image extraction
        article.raw_doc = deepcopy(doc)

        try:
            article.title = extractor.get_title(article)
            # TODO
            # article.publish_date = config.publishDateExtractor.extract(doc)
            # article.additional_data = config.get_additionaldata_extractor.extract(doc)
            article.meta_lang = extractor.get_meta_lang(article)
            article.meta_favicon = extractor.get_favicon(article)
            article.meta_description = extractor.get_meta_description(article)
            article.meta_keywords = extractor.get_meta_keywords(article)
            article.canonical_link = extractor.get_canonical_link(article)
            article.domain = extractor.get_domain(article.final_url)
            article.tags = extractor.extract_tags(article)
            # before we do any calcs on the body itself let's clean up the document
            article.doc = document_cleaner.clean(article)
            # big stuff
            article.top_node = extractor.calculate_best_node(article)
            if article.top_node is None:
                # no best node found: fall back to the whole parsed document
                article.top_node = doc
            if article.top_node is not None:
                # video handling
                video_extractor = self.get_video_extractor(article)
                video_extractor.get_videos()
                # image handling
                if self.config.enable_image_fetching:
                    image_extractor = self.get_image_extractor(article)
                    article.top_image = image_extractor.get_best_image(
                        article.raw_doc, article.top_node)
                # post cleanup (disabled in this variant)
                # article.top_node = extractor.post_cleanup(article.top_node)
                # clean_text
                article.cleaned_text = output_formatter.get_formatted_text(article)
                if article.meta_description is None:
                    # no description in the page metadata: use the first 150
                    # characters of the extracted text instead
                    article.meta_description = text_content(
                        article.cleaned_text)[:150]
        finally:
            # cleanup tmp file -- guaranteed even if an extractor raised
            # (method name spelled as it is defined outside this block)
            self.relase_resources(article)

        return article
Example #6
0
    def crawl(self, crawl_candidate):
        """Crawl one candidate, decoding its HTML before extraction.

        Returns an ``Article``; it is returned as created when ``get_html``
        gives ``None``. Otherwise the fetched bytes are decoded using the
        charset from ``get_charset`` (errors ignored), pre-processed, parsed,
        and the usual metadata / top node / cleaned text fields are filled in.
        A missing meta description is replaced by the first 150 characters of
        the cleaned text.

        Side effect: for pages in a GB* charset, or containing characters in
        U+4E00..U+9FA5, ``self.config.stopwords_class`` is switched to
        ``StopWordsChinese`` and is not restored afterwards.

        Extraction errors propagate to the caller after ``relase_resources``
        has been given the chance to remove temporary files.
        """
        article = Article()

        parse_candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return article
        charset = get_charset(raw_html)
        raw_html = raw_html.decode(charset, 'ignore')
        pattern = re.compile("[\u4e00-\u9fa5]")
        # Compare against the exact charset names; searching for the charset
        # inside the string 'GB2312 GBK GB18030' matched any fragment of it.
        if charset.upper() in ('GB2312', 'GBK', 'GB18030') \
                or pattern.search(raw_html) is not None:
            self.config.stopwords_class = StopWordsChinese
            # NOTE(review): leftover debug print ("Chinese"); preserved
            # unchanged, consider replacing with logging.
            print("中文")
        # presumably removes these elements -- TODO confirm against clean_tags
        raw_html = clean_tags(raw_html, ['SOHUADCODE', 'script', 'style'])
        if charset != 'utf-8':
            # NOTE(review): case-sensitive check; 'UTF-8' would also rewrite
            # the meta charset -- confirm the form get_charset returns
            raw_html = replace_meta_charset(raw_html)
        raw_html = force_meta(raw_html)
        doc = self.get_document(parse_candidate.url, raw_html)
        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        # pre-cleaning copy, handed to the image extractor later
        article.raw_doc = deepcopy(doc)

        try:
            article.title = extractor.get_title(article)
            # TODO
            # article.publish_date = config.publishDateExtractor.extract(doc)
            # article.additional_data = config.get_additionaldata_extractor.extract(doc)
            article.meta_lang = extractor.get_meta_lang(article)
            article.meta_favicon = extractor.get_favicon(article)
            article.meta_description = extractor.get_meta_description(article)
            article.meta_keywords = extractor.get_meta_keywords(article)
            article.canonical_link = extractor.get_canonical_link(article)
            article.domain = extractor.get_domain(article.final_url)
            article.tags = extractor.extract_tags(article)
            # before we do any calcs on the body itself let's clean up the document
            article.doc = document_cleaner.clean(article)
            # big stuff
            article.top_node = extractor.calculate_best_node(article)
            if article.top_node is None:
                # nothing scored as best node: use the parsed document itself
                article.top_node = doc
            if article.top_node is not None:
                # video handling
                video_extractor = self.get_video_extractor(article)
                video_extractor.get_videos()
                # image handling
                if self.config.enable_image_fetching:
                    image_extractor = self.get_image_extractor(article)
                    article.top_image = image_extractor.get_best_image(
                        article.raw_doc, article.top_node)
                # post cleanup (disabled in this variant)
                # article.top_node = extractor.post_cleanup(article.top_node)
                # clean_text
                article.cleaned_text = output_formatter.get_formatted_text(article)
                if article.meta_description is None:
                    # fall back to the start of the extracted text
                    article.meta_description = text_content(
                        article.cleaned_text)[:150]
        finally:
            # cleanup tmp file -- runs on failure as well as success
            # (method name spelled as it is defined outside this block)
            self.relase_resources(article)

        return article