Example 1
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "
Example 2
    def __init__(self, config):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # TODO: log prefix
        self.logPrefix = "crawler:"
Example 3
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # no need to go further if image fetching or local_storage_path is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # test if config.local_storage_path is a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check whether it is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable, you need to set this for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)
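
The constructor above accepts either a ready-made Configuration instance or a plain dict whose matching keys are copied onto a default Configuration (examples 5 and 6 below show it as the Goose constructor). A minimal sketch of both call styles, assuming Configuration is importable from goose3.configuration:

from goose3 import Goose
from goose3.configuration import Configuration  # assumed import path

# 1) pass a Configuration instance
config = Configuration()
config.enable_image_fetching = False
g1 = Goose(config)

# 2) pass a plain dict; keys that match Configuration attributes are copied over
g2 = Goose({'enable_image_fetching': False,
            'local_storage_path': '/tmp/goose'})  # placeholder path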
Example 4
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):

        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.article

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash)

    def process(self, raw_html, final_url, link_hash):

        # create document
        doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        # print(metas)
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']

        # publishdate
        self.article._publish_date = self.publishdate_extractor.extract()
        if self.article.publish_date:
            try:
                publish_datetime = dateutil.parser.parse(
                    self.article.publish_date)
                if publish_datetime.tzinfo:
                    self.article._publish_datetime_utc = publish_datetime.astimezone(
                        tzutc())
                else:
                    self.article._publish_datetime_utc = publish_datetime
            except (ValueError, OverflowError):
                self.article._publish_datetime_utc = None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        # check for known node as content body
        # if we find one force the article.doc to be the found node
        # this will prevent the cleaner from removing unwanted text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc = [self.cleaner.clean(doc)]
        else:
            doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.calculate_best_node(doc)

        # if we do not find an article within the discovered possible article nodes,
        # try again with the root node.
        if self.article._top_node is None:
            # try again with the root node.
            self.article._top_node = self.extractor.calculate_best_node(
                self.article._doc)
        else:
            # set the doc member to the discovered article node.
            self.article._doc = doc

        # if we have a top node
        # let's process it
        if self.article._top_node is not None:

            # article links
            self.article._links = self.links_extractor.extract()

            # tweets
            self.article._tweets = self.tweets_extractor.extract()

            # video handling
            self.article._movies = self.video_extractor.get_videos()

            # image handling
            if self.config.enable_image_fetching:
                self.get_image()

            # post cleanup
            self.article._top_node = self.extractor.post_cleanup()

            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(
            doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        if response.encoding != 'ISO-8859-1':  # requests has a good idea; use what it says
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
Example 5
class Goose(object):
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # no need to go further if image fetching or local_storage_path is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # test if config.local_storage_path is a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check whether it is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable, you need to set this for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        ''' close the connection and any other cleanup required '''
        if self.fetcher is not None:
            self.shutdown_network()
        self.finalizer.atexit = False  # turn off the garbage collection close

    def extract(self, url=None, raw_html=None):
        """
        Main method to extract an article object from a URL,
        pass in a url and get back a Article
        """
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(crawl_candidate)

    def shutdown_network(self):
        ''' ensure the connection is closed '''
        self.fetcher.close()
        self.fetcher = None

    def crawl(self, crawl_candidate):
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        try:
            crawler = Crawler(self.config, self.fetcher)
            article = crawler.crawl(crawl_candidate)
        except (UnicodeDecodeError, ValueError) as ex:
            if parsers:
                self.config.parser_class = parsers[0]
                return self.crawl(crawl_candidate)
            else:
                raise ex
        return article
Example 6
class Goose(object):
    ''' Extract the most likely article content and additional metadata from a URL
        or previously fetched HTML document

        Args:
            config (Configuration, dict): A configuration file or dictionary \
            representation of the configuration file
        Returns:
            Goose: An instance of the goose extraction object '''
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # no need to go further if image fetching or local_storage_path is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # test if config.local_storage_path is a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check whether it is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable, you need to set this for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

    def __enter__(self):
        ''' Setup the context manager '''
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        ''' Define what to do when the context manager exits '''
        self.close()

    def close(self):
        ''' Close the network connection and perform any other required cleanup

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        if self.fetcher is not None:
            self.shutdown_network()
        self.finalizer.atexit = False  # turn off the garbage collection close

    def extract(self, url=None, raw_html=None):
        ''' Extract the most likely article content from the html page

            Args:
                url (str): URL to pull and parse
                raw_html (str): String representation of the HTML page
            Returns:
                Article: Representation of the article contents \
                including other parsed and extracted metadata '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.__crawl(crawl_candidate)

    def shutdown_network(self):
        ''' Close the network connection

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        self.fetcher.close()
        self.fetcher = None

    def __crawl(self, crawl_candidate):
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        try:
            crawler = Crawler(self.config, self.fetcher)
            article = crawler.crawl(crawl_candidate)
        except (UnicodeDecodeError, ValueError) as ex:
            if parsers:
                self.config.parser_class = parsers[0]
                return self.__crawl(crawl_candidate)
            else:
                raise ex
        return article
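
Given the context-manager hooks and the extract() signature documented above, typical usage looks roughly like the following sketch (the URL and HTML below are placeholders):

from goose3 import Goose

# the finalizer/close logic above makes the context-manager form the safest way
# to ensure the network connection is released
with Goose({'enable_image_fetching': False}) as g:
    article = g.extract(url='https://example.com/some-article')  # placeholder URL
    print(article.title)
    print(article.cleaned_text[:200])

# extraction from already-fetched HTML is also supported
raw = '<html><body><article><p>Hello world</p></article></body></html>'
with Goose() as g:
    article = g.extract(raw_html=raw)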
Example 7
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):

        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.article

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash)

    def process(self, raw_html, final_url, link_hash):

        # create document
        doc = self.get_document(raw_html)

        # article
        self.article.final_url = final_url
        self.article.link_hash = link_hash
        self.article.raw_html = raw_html
        self.article.doc = doc
        self.article.raw_doc = deepcopy(doc)

        # open graph
        self.article.opengraph = self.opengraph_extractor.extract()

        # publishdate
        self.article.publish_date = self.publishdate_extractor.extract()

        # meta
        metas = self.metas_extractor.extract()
        # print(metas)
        self.article.meta_lang = metas['lang']
        self.article.meta_favicon = metas['favicon']
        self.article.meta_description = metas['description']
        self.article.meta_keywords = metas['keywords']
        self.article.canonical_link = metas['canonical']
        self.article.domain = metas['domain']

        # tags
        self.article.tags = self.tags_extractor.extract()

        # authors
        self.article.authors = self.authors_extractor.extract()

        # title
        self.article.title = self.title_extractor.extract()

        # check for known node as content body
        # if we find one force the article.doc to be the found node
        # this will prevent the cleaner from removing unwanted text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            self.article.doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(self.article.doc, list):
            self.article.doc = [self.cleaner.clean(self.article.doc)]
        else:
            self.article.doc = [
                self.cleaner.clean(deepcopy(x)) for x in self.article.doc
            ]

        # big stuff
        self.article.top_node = self.extractor.calculate_best_node()

        # if we have a top node
        # let's process it
        if self.article.top_node is not None:

            # article links
            self.article.links = self.links_extractor.extract()

            # tweets
            self.article.tweets = self.tweets_extractor.extract()

            # video handling
            self.video_extractor.get_videos()

            # image handling
            if self.config.enable_image_fetching:
                self.get_image()

            # post cleanup
            self.article.top_node = self.extractor.post_cleanup()

            # clean_text
            self.article.cleaned_text = self.formatter.get_formatted_text()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article.top_image = self.image_extractor.get_best_image(
            doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
Example 8
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # microdata extractor
        self.microdata_extractor = self.get_microdata_extractor()

        # hCard extractor
        self.hcard_extractor = self.get_hcard_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate, crawl_sub=True):

        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        doc = None
        if crawl_candidate.doc is None:
            # raw html
            raw_html = self.get_html(crawl_candidate, parse_candidate)

            if raw_html is None:
                return self.article
        else:
            doc = crawl_candidate.doc
            raw_html = None
        return self.process(
            raw_html, parse_candidate.url, parse_candidate.link_hash, doc, crawl_sub)

    def process(self, raw_html, final_url, link_hash, doc=None, crawl_sub=False):

        # create document
        if doc is None:
            doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article.site_domain = goose3.text.get_site_domain(final_url)
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article.doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        # print(metas)
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']
        self.article.metatags = metas['metatags']

        # publishdate
        self.article._publish_date = self.publishdate_extractor.extract()
        if self.article.publish_date:
            try:
                publish_datetime = dateutil.parser.parse(self.article.publish_date)
                if publish_datetime.tzinfo:
                    self.article._publish_datetime_utc = publish_datetime.astimezone(tzutc())
                else:
                    self.article._publish_datetime_utc = publish_datetime
            except (ValueError, OverflowError):
                self.article._publish_datetime_utc = None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # Parse JSON-LD
        json_ld_tags = self.parser.xpath_re(
            self.article.doc, 'descendant::script[@type="application/ld+json"]')
        if json_ld_tags:
            json_ld_text = self.parser.getText(json_ld_tags[0])
            for i in range(2):
                try:
                    self.article.json_ld = json.loads(json_ld_text)
                    break
                except ValueError:
                    if i == 0:
                        # retry once after patching a common malformed-JSON pattern
                        json_ld_text = json_ld_text.replace('""', '", "')

        for sub_article in self.article.sub_articles:
            if sub_article.node == self.article.doc:
                continue
            self.parser.remove(sub_article.node)

        self.article.doc = self.cleaner.remove_nested_article_tags(self.article.doc)
        
        # microdata
        self.article.microdata = self.microdata_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        # hcard
        self.article.hcards = self.hcard_extractor.extract()

        self.article.read_more_url = self.links_extractor.extract_read_more()

        # check for known node as content body
        # if we find one force the article.doc to be the found node
        # this will prevent the cleaner from removing unwanted text content
        # article_body = self.extractor.get_known_article_tags()
        if crawl_sub:
            article_body = self.extractor.get_known_article_tags()
            # article_body = articles[0] if articles else None
        else:
            article_body = None
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc_nodes = [self.cleaner.clean(doc)]
        else:
            doc_nodes = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.calculate_best_node(doc_nodes)

        # if we do not find an article within the discovered possible article nodes,
        # try again with the root node.
        if self.article._top_node is None:
            # try again with the root node.
            self.article._top_node = self.extractor.calculate_best_node(self.article._doc)
            if self.article.top_node is None:
                self.article._top_node = self.article.doc
        else:
            # set the doc member to the discovered article node.
            # self.article._doc = doc
            self.article.doc = doc[0] if isinstance(doc, list) else doc

        # if we have a top node
        # let's process it
        if self.article._top_node is not None:

            # article links
            self.article._links = self.links_extractor.extract()
            self.article.html_links = self.links_extractor.extract_html_links()

            # tweets
            self.article._tweets = self.tweets_extractor.extract()

            # video handling
            self.article._movies = self.video_extractor.get_videos()

            # image handling
            if self.config.enable_image_fetching:
                self.get_image()

            # post cleanup
            if crawl_sub:
                self.article._top_node = self.extractor.post_cleanup()

            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text(
                remove_fewwords=crawl_sub)

        # cleanup tmp file
        self.release_resources()
        if crawl_sub and len(self.article.sub_articles) > 1:
            active_sub_articles = []
            for i in range(len(self.article.sub_articles)):
                sub_article = self.article.sub_articles[i]
                if sub_article.node == self.article.doc:
                    continue
                crawler = Crawler(self.config)
                crawled_article = crawler.crawl(
                    CrawlCandidate(
                        self.config, final_url, raw_html=sub_article.outer_html),
                    crawl_sub=False
                )
                sub_article.crawled_article = crawled_article
                active_sub_articles.append(sub_article)

            del self.article.sub_articles[:]
            self.article.sub_articles.extend(active_sub_articles)

        if crawl_sub and self.article.sub_articles:
            self.article.sub_articles.sort(
                    key=lambda obj: -len(obj.cleaned_text))
            if not self.article.cleaned_text and \
               self.article.sub_articles[0].crawled_article:
                self.article.cleaned_text = \
                    self.article.sub_articles[0].crawled_article.cleaned_text
            if not self.article.authors:
                self.article.authors = \
                    self.article.sub_articles[0].authors
        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.doc is not None:
            return SubArticle.get_parsing_candidate(crawl_candidate.doc)
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url, crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        if response.encoding != 'ISO-8859-1':  # requests has a good idea; use what it says
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings

        if not html:
            html = ""
        crawl_candidate.raw_html = html

        # Twitter/Facebook-specific news crawling. Should be moved to a separate module.
        site_domain = goose3.text.get_site_domain(parsing_candidate.url)
        if site_domain == "twitter.com":
            doc = self.parser.fromstring(html)
            a_links = self.parser.getElementsByTag(
                doc, tag='a', attr='class', value='twitter-timeline-link')
            if a_links:
                parsing_candidate.url = self.parser.getAttribute(a_links[0], 'href')
                html = self.fetcher.fetch(parsing_candidate.url)
                crawl_candidate.raw_html = html
        elif site_domain == "www.facebook.com" and "/posts/" in parsing_candidate.url:
            html = html.replace("<!--", "")
            html = html.replace("-->", "")
            doc = self.parser.fromstring(html)
            a_links = self.parser.xpath_re(
                doc, "//*[@class='hidden_elem']/descendant::a")

            link_re = re.compile(r"https?://l\.facebook\.com/l\.php\?u=(?P<url>[^&]+)&h")
            for a_link in a_links:
                href = a_link.attrib.get('href')
                match = link_re.search(href)
                if match:
                    url = match.groupdict()["url"]
                    parsing_candidate.url = urllib.parse.unquote(url)
                    html = self.fetcher.fetch(parsing_candidate.url)
                    crawl_candidate.raw_html = html
                    break

        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_microdata_extractor(self):
        return MicroDataExtractor(self.config, self.article)

    def get_hcard_extractor(self):
        return HCardExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path, '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
Example 9
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):

        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.article

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash)

    def process(self, raw_html, final_url, link_hash):

        # create document
        doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        # print(metas)
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        if 'domain' in metas:
            self.article._domain = metas['domain']
        else:
            self.article._domain = up.urlparse(self.article.final_url).netloc

        # publishdate
        if 'datePublished' in self.article.schema:
            self.article._publish_date = self.article.schema['datePublished']
        else:
            self.article._publish_date = self.publishdate_extractor.extract()

        if self.article.publish_date:
            try:
                publish_datetime = dateutil.parser.parse(
                    self.article.publish_date)
                if publish_datetime.tzinfo:
                    self.article._publish_datetime_utc = publish_datetime.astimezone(
                        tzutc())
                else:
                    self.article._publish_datetime_utc = publish_datetime
            except (ValueError, OverflowError):
                self.article._publish_datetime_utc = None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        if 'author' in self.article.schema:
            if isinstance(self.article.schema['author'], list):
                self.article._authors = list(
                    map(lambda entry: entry['name'],
                        self.article.schema['author']))
            elif isinstance(self.article.schema['author'], dict):
                author_string = self.article.schema['author'][
                    'name'] if 'name' in self.article.schema['author'] else ''
                if not author_string:
                    # fall back to the publisher name when present, otherwise leave authors empty
                    if ('publisher' in self.article.schema
                            and 'name' in self.article.schema['publisher']):
                        self.article._authors = [
                            self.article.schema['publisher']['name']
                        ]
                    else:
                        self.article._authors = []
                elif ',' in author_string:
                    self.article._authors = list(
                        map(str.strip, author_string.split(',')))
                elif ' und ' in author_string:
                    self.article._authors = list(
                        map(str.strip, author_string.split(' und ')))
                elif ' and ' in author_string:
                    self.article._authors = list(
                        map(str.strip, author_string.split(' and ')))
                else:
                    self.article._authors = [author_string]
        else:
            self.article._authors = self.authors_extractor.extract()

        self.article._authors = list(map(str.title, self.article.authors))

        # title
        self.article._title = self.title_extractor.extract()

        # check for known node as content body
        # if we find one force the article.doc to be the found node
        # this will prevent the cleaner from removing unwanted text content
        if 'articleBody' in self.article.schema:
            self.article._cleaned_text = self.clean_plain_text(
                self.article.schema['articleBody'])
        elif 'articleBody' in metas:
            self.article._cleaned_text = self.clean_plain_text(
                metas['articleBody'])
        elif 'articleBody' in self.article.opengraph:
            self.article._cleaned_text = self.clean_plain_text(
                self.article.opengraph['articleBody'])
        else:
            (article_body,
             domain_match) = self.extractor.get_known_article_tags()
            if article_body is not None:
                doc = article_body

            # before we do any calcs on the body itself let's clean up the document
            if not isinstance(doc, list):
                doc = [self.cleaner.clean(doc)]
            else:
                doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

            # get the full text content and set cleaned_text as a fallback
            self.article._cleaned_text = " ".join(
                self.extractor.get_full_text(doc))

            # otherwise compute the best node
            self.article._top_node = self.extractor.calculate_best_node(
                doc, domain_match)

            # if we do not find an article within the discovered possible article nodes,
            # try again with the root node.
            if self.article._top_node is None:
                # try again with the root node.
                self.article._top_node = self.extractor.calculate_best_node(
                    self.article._doc, domain_match)
            else:
                # set the doc member to the discovered article node.
                self.article._doc = doc

            # if we have a top node
            # let's process it
            if self.article._top_node is not None:

                # article links
                self.article._links = self.links_extractor.extract()

                # tweets
                self.article._tweets = self.tweets_extractor.extract()

                # video handling
                self.article._movies = self.video_extractor.get_videos()

                # image handling
                if self.config.enable_image_fetching:
                    self.get_image()

                # post cleanup
                self.article._top_node = self.extractor.post_cleanup()

                # clean_text
                self.article._cleaned_text = self.clean_plain_text(
                    self.formatter.get_formatted_text())

        # check for image in linked data
        if self.config.enable_image_fetching:
            if 'image' in self.article.schema:
                self.article._top_image = self.get_image_extractor().get_image(
                    self.article.schema['image']['url'],
                    extraction_type="Linked Data")

        if self.article.cleaned_text and self.article.cleaned_text != '':
            self.article._meta_lang = langdetect.detect(
                self.article.cleaned_text)

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def clean_plain_text(self, raw_text):
        # strip html tags
        res = re.sub(r"<.*?>", "", raw_text)
        # normalize unicode (e.g. non-breaking spaces)
        res = unicodedata.normalize("NFKC", res)
        # remove soft hyphens
        res = re.sub('[\xc2\xad]', '', res)
        # replace guillemets and curly quotes with straight quotes
        res = re.sub("[\u00BB\u00AB\u201C\u201D\u201E]", '"', res)
        # replace line feeds
        res = res.replace('\n', ' ')
        # shrink multiple spaces to one
        res = re.sub(r"\s+", " ", res)
        return res

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(
            doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        if response.encoding != 'ISO-8859-1':  # requests has a good idea; use what it says
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
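
As a brief aside before the next example (not part of the original listing), the language detection at the start of this example calls langdetect.detect directly on the cleaned text. langdetect can return different answers across runs for short or ambiguous input unless its factory is seeded, which the later example handles explicitly; a minimal standalone check, using a made-up sample string, might look like this:

from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0   # seed first so results are reproducible across runs
cleaned_text = "Dies ist ein kurzer Beispieltext in deutscher Sprache."
if cleaned_text:
    print(detect(cleaned_text))  # expected: 'de'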
Esempio n. 10
class Crawler(object):
    def __init__(self, config: Configuration, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # image extractor
        self.image_extractor = self.get_image_extractor()

    def crawl(self, crawl_candidate):

        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            logger.warning(
                "No raw_html is provided or could be fetched; continuing with an empty Article object"
            )
            return self.article

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash)

    def process(self, raw_html: str, final_url: str,
                link_hash: str) -> Article:

        # create document
        doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        # print(metas)
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']

        # publishdate
        self.article._publish_date = self.publishdate_extractor.extract()
        self.article._publish_datetime_utc = (
            self._publish_date_to_utc() if self.article.publish_date else None)

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        # if no meta language was found, fall back to detecting one
        # from the description, title, keywords and tags
        if self.article._meta_lang is None:
            self.article._meta_lang = self._alternative_language_extractor()

        # check for a known node that holds the article body;
        # if we find one, force article.doc to be that node so the
        # cleaner does not strip content we want to keep
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc = [self.cleaner.clean(doc)]
        else:
            doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.calculate_best_node(doc)

        # if we do not find an article within the discovered possible article nodes,
        # try again with the root node.
        if self.article._top_node is None:
            self.article._top_node = self.extractor.calculate_best_node(
                self.article._doc)
        else:
            # set the doc member to the discovered article node.
            self.article._doc = doc

        # if we have a top node
        # let's process it
        if self.article._top_node is not None:

            # article links
            self.article._links = self.links_extractor.extract()

            # tweets
            self.article._tweets = self.tweets_extractor.extract()

            # video handling
            self.article._movies = self.video_extractor.get_videos()

            # image handling
            if self.config.enable_image_fetching:
                self.get_image()

            # post cleanup
            self.article._top_node = self.extractor.post_cleanup()

            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(
            crawl_candidate: CrawlCandidate) -> ParsingCandidate:
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(
            doc, top_node)

    def get_html(self, crawl_candidate: CrawlCandidate,
                 parsing_candidate: ParsingCandidate) -> str:
        # we already have raw HTML;
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            logger.debug(f"Using raw_html for {crawl_candidate}")
            return crawl_candidate.raw_html

        # fetch HTML
        logger.debug(f"Fetching html from {crawl_candidate.url}")
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        # ISO-8859-1 is requests' fallback default; any other value means a
        # charset was actually declared, so trust it
        if response.encoding != 'ISO-8859-1':
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                # no encoding was declared in the content; leave it unset
                self.article._meta_encoding = None
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                logger.error(f"File {fname} could not be removed")

    def _publish_date_to_utc(self):
        try:
            publish_datetime = dateutil.parser.parse(self.article.publish_date,
                                                     tzinfos=TIMEZONE_INFO)
            if publish_datetime.tzinfo:
                return publish_datetime.astimezone(tzutc())
            else:
                return publish_datetime
        except (ValueError, OverflowError):
            logger.warning(
                f"Publish date {self.article.publish_date} could not be resolved to UTC"
            )
            return None

    def _alternative_language_extractor(self):
        tmp_lang_detect = "{} {} {} {}".format(self.article._meta_description,
                                               self.article._title,
                                               self.article._meta_keywords,
                                               self.article._tags)
        tmp_lang_detect = " ".join(tmp_lang_detect.split())
        if len(tmp_lang_detect) > 15:
            # required to make it deterministic;
            # see: https://github.com/Mimino666/langdetect/blob/master/README.md#basic-usage
            DetectorFactory.seed = 0
            try:
                return detect(tmp_lang_detect)
            except LangDetectException:
                logger.warning(
                    "Alternative language extractor failed to extract a known language"
                )
                return None
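
To round off this example (and not as part of the original listing), here is a minimal, hypothetical driver for the Crawler class above for the case where the HTML has already been fetched. The SimpleNamespace object merely stands in for the library's CrawlCandidate, since the code above only reads its url and raw_html attributes; the URL and markup are made up, and cleaned_text may well be empty for such trivial input.

from types import SimpleNamespace

config = Configuration()                  # configuration class from the earlier examples
config.enable_image_fetching = False      # skip image handling for locally supplied HTML
crawler = Crawler(config)
candidate = SimpleNamespace(
    url="https://example.com/story",      # made-up URL for illustration
    raw_html="<html><body><article><p>Hello, world.</p></article></body></html>",
)
article = crawler.crawl(candidate)
print(article.cleaned_text)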