Example #1
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # catalogue
        self.catalogue = Catalogue()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # TODO: use the log prefix
        self.log_prefix = "urlcrawler: "

        self.extractor = self.get_extractor()
Example #2
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was passed in as a dict, copy its values onto the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # nothing more to do if local storage is not set or image fetching is disabled
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # make sure config.local_storage_path exists as a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist; it is required for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check that it is writable
        fd, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(fd, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable; it is required for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)
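
A minimal usage sketch of the constructor above, assuming it belongs to the Goose facade shown in Example #7, that the package exposes Goose and Configuration at the top level (the import path is a guess), and relying only on the local_storage_path and enable_image_fetching attributes the code itself reads:

import tempfile

from goose3 import Goose, Configuration  # assumed import path

# Style 1: pass a plain dict; keys that exist on Configuration are copied over
g1 = Goose({'enable_image_fetching': False})

# Style 2: pass a Configuration instance directly
config = Configuration()
config.enable_image_fetching = True
config.local_storage_path = tempfile.mkdtemp()  # must exist and be writable
g2 = Goose(config)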
Example #3
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # reportage news article extractor
        self.reportagenewsarticle_extractor = self.get_reportagenewsarticle_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # attach extractor
        self.attach_extractor = self.get_attach_extractor()

        # image extractor
        self.image_extractor = self.get_image_extractor()

        self.custom_extractor = self.get_custom_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "
Example #4
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # reportage news article extractor
        self.reportagenewsarticle_extractor = self.get_reportagenewsarticle_extractor()

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # attach extractor
        self.attach_extractor = self.get_attach_extractor()

        # image extractor
        self.image_extractor = self.get_image_extractor()

        self.custom_extractor = self.get_custom_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):

        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.article

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash,
                            parse_candidate.encoding)

    def process(self, raw_html, final_url, link_hash, encoding=None):

        # create document
        doc = self.get_document(raw_html, encoding)

        # article
        self.article._final_url = final_url or self.config.final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        self.article._schema = self.reportagenewsarticle_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        self.article._attaches = self.attach_extractor.extract()

        for k in self.config.custom_rule:
            if k not in ('title', 'author', 'pubtime', 'content', 'attaches'):
                self.article.add_additional_data(
                    k, self.custom_extractor.extract(k))

        # check for a known node as the content body; if we find one, force
        # article.doc to be that node so the cleaner does not strip wanted
        # text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc = [self.cleaner.clean(doc)]
        else:
            doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # main content extraction: locate the top node
        self.article._top_node = self.extractor.custom_top_node(doc)

        if self.article._top_node is not None:
            self.article._doc = doc

            # publishdate
            self.article._publish_date = self.publishdate_extractor.extract()

            # article links
            self.article._links = self.links_extractor.extract()

            # tweets
            self.article._tweets = self.tweets_extractor.extract()

            # video handling
            self.article._movies = self.video_extractor.get_videos()

            # image handling
            if self.config.enable_image_fetching:
                self.get_image()

            self.article._top_node_html = self.parser.outerHtml(
                self.article._top_node)

            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text()
        else:
            self.article._top_node = self.extractor.calculate_best_node(
                self.article._doc)

            # publishdate
            self.article._publish_date = self.publishdate_extractor.extract()

            # if we have a top node
            # let's process it
            if self.article._top_node is not None:

                # article links
                self.article._links = self.links_extractor.extract()

                # tweets
                self.article._tweets = self.tweets_extractor.extract()

                # video handling
                self.article._movies = self.video_extractor.get_videos()

                # image handling
                if self.config.enable_image_fetching:
                    self.get_image()

                # post cleanup
                self.article._top_node = self.extractor.post_cleanup()

                self.article._top_node_html = self.parser.outerHtml(
                    self.article._top_node)

                # clean_text
                self.article._cleaned_text = self.formatter.get_formatted_text()

        if not self.article._cleaned_text:
            self.article._cleaned_text = self.extractor.extract()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html,
                                                   crawl_candidate.encoding)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(
            doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we already have raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_reportagenewsarticle_extractor(self):
        return ReportageNewsArticleExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_attach_extractor(self):
        return AttachExtractor(self.fetcher, self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html, encoding=None):
        doc = self.parser.fromstring(raw_html, encoding)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def get_custom_extractor(self):
        return CustomExtractor(self.config, self.article)

    def release_resources(self):
        if not self.config.local_storage_path:
            return
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
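
Crawler is normally driven through the Goose facade in Example #7, but it can also be exercised directly with a CrawlCandidate, constructed positionally as (config, url, raw_html, encoding) exactly as Goose.extract() does. A hedged sketch; the import paths are assumptions:

from goose3.configuration import Configuration       # assumed import paths
from goose3.crawler import Crawler, CrawlCandidate

config = Configuration()
crawler = Crawler(config)  # builds its own NetworkFetcher when none is passed in

# a non-None raw_html here would short-circuit the network fetch in get_html()
candidate = CrawlCandidate(config, "https://example.com/post", None, None)
article = crawler.crawl(candidate)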
Example #5
class CustomCrawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # catalogue
        self.catalogue = Catalogue()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # TODO: use the log prefix
        self.log_prefix = "urlcrawler: "

        self.extractor = self.get_extractor()

    def crawl(self, crawl_candidate):

        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.catalogue

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash,
                            parse_candidate.encoding)

    def get_document(self, raw_html, encoding=None):
        doc = self.parser.fromstring(raw_html, encoding)
        return doc

    def get_html(self, crawl_candidate, parsing_candidate):
        # we already have raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html,
                                                   crawl_candidate.encoding)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_extractor(self):
        return CustomExtractor(self.config, self.catalogue)

    def process(self, raw_html, final_url, link_hash, encoding=None):

        # create document
        doc = self.get_document(raw_html, encoding)

        # catalogue
        self.catalogue._final_url = final_url or self.config.final_url
        self.catalogue._link_hash = link_hash
        self.catalogue._raw_html = raw_html
        self.catalogue._doc = doc
        self.catalogue._raw_doc = deepcopy(doc)

        custom_rule = self.config.custom_rule
        if custom_rule:
            data = {}
            onlyOne = custom_rule.pop('onlyOne', 1)
            if 'item' in custom_rule and custom_rule['item']:
                if 'filter' in custom_rule and custom_rule['filter']:
                    doc = self.extractor.custom_match_elements(
                        custom_rule['filter'], doc=doc)
                self.catalogue._doc = doc
                for key, rule in custom_rule['item'].items():
                    parsed = self.extractor.extract(key, rule, onlyOne)
                    parsed = utils.patch_result(parsed, rule)
                    parsed = utils.extract_result(parsed, rule)
                    data[key] = parsed if isinstance(parsed, list) else [parsed]
                self.catalogue.data = utils.table2kvlist(data)
            else:
                for key, rule in custom_rule.items():
                    parsed = self.extractor.extract(key, rule)
                    parsed = utils.patch_result(parsed, rule)
                    parsed = utils.extract_result(parsed, rule)
                    data[key] = parsed
                self.catalogue.data = [data]
        return self.catalogue
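
CustomCrawler.process() is driven entirely by config.custom_rule: an optional 'filter' entry narrows the document via custom_match_elements(), an 'item' entry maps field names to per-field rules that are merged into rows with table2kvlist(), the 'onlyOne' flag (default 1) is forwarded to the extractor, and a flat rule dict is treated as a single record. A hypothetical rule set showing only that shape; the rule values are placeholders whose real syntax depends on CustomExtractor:

custom_rule = {
    'onlyOne': 0,                        # popped before extraction and forwarded to extractor.extract()
    'filter': '<list container rule>',   # placeholder, passed to custom_match_elements()
    'item': {                            # per-field rules, merged into rows by table2kvlist()
        'title': '<title rule>',         # placeholder rule strings
        'url': '<link rule>',
    },
}
config.custom_rule = custom_rule         # assumes config is the Configuration used above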
Example #6
class CatalogueCrawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # catalogue
        self.catalogue = Catalogue()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # TODO: use the log prefix
        self.log_prefix = "urlcrawler: "

        self.extractor = self.get_extractor()

    def crawl(self, crawl_candidate):

        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.catalogue

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash,
                            parse_candidate.encoding)

    def get_document(self, raw_html, encoding=None):
        doc = self.parser.fromstring(raw_html, encoding)
        return doc

    def get_html(self, crawl_candidate, parsing_candidate):
        # we already have raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html,
                                                   crawl_candidate.encoding)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.catalogue)

    def get_extractor(self):
        return CatalogueExtractor(self.config, self.catalogue)

    def process(self, raw_html, final_url, link_hash, encoding=None):

        # create document
        doc = self.get_document(raw_html, encoding)

        # catalogue
        self.catalogue._final_url = final_url or self.config.final_url
        self.catalogue._link_hash = link_hash
        self.catalogue._raw_html = raw_html
        self.catalogue._doc = doc
        self.catalogue._raw_doc = deepcopy(doc)

        metas = self.metas_extractor.extract()
        self.catalogue._meta_lang = metas['lang']
        self.catalogue._meta_favicon = metas['favicon']
        self.catalogue._meta_description = metas['description']
        self.catalogue._meta_keywords = metas['keywords']
        self.catalogue._canonical_link = metas['canonical']
        self.catalogue._domain = metas['domain']

        self.catalogue.data = self.extractor.extract()

        return self.catalogue
Example #7
class Goose(object):
    ''' Extract the most likely article content and additional metadata from a URL
        or previously fetched HTML document

        Args:
            config (Configuration, dict): A configuration object or dictionary \
            representation of the configuration
        Returns:
            Goose: An instance of the goose extraction object '''
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was passed in as a dict, copy its values onto the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # nothing more to do if local storage is not set or image fetching is disabled
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # make sure config.local_storage_path exists as a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist; it is required for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check that it is writable
        fd, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(fd, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable; it is required for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

    def __enter__(self):
        ''' Setup the context manager '''
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        ''' Define what to do when the context manager exits '''
        self.close()

    def close(self):
        ''' Close the network connection and perform any other required cleanup

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        if self.fetcher is not None:
            self.shutdown_network()
        self.finalizer.atexit = False  # turn off the garbage collection close

    def extract(self, url=None, raw_html=None, encoding=None):
        ''' Extract the most likely article content from the html page

            Args:
                url (str): URL to pull and parse
                raw_html (str): String representation of the HTML page
                encoding (str): Encoding of the HTML, if known
            Returns:
                Article: Representation of the article contents \
                including other parsed and extracted metadata '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html, encoding)
        return self.__crawl(crawl_candidate)

    def fetch(self, url=None, raw_html=None, encoding=None):
        crawl_candidate = CrawlCandidate(self.config, url, raw_html, encoding)
        return self.__fetch(crawl_candidate)

    def parse(self, url=None, raw_html=None, encoding=None):
        crawl_candidate = CrawlCandidate(self.config, url, raw_html, encoding)
        return self.__parse(crawl_candidate)

    def shutdown_network(self):
        ''' Close the network connection

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        self.fetcher.close()
        self.fetcher = None

    def __crawl(self, crawl_candidate):
        ''' wrap the crawling functionality '''
        def crawler_wrapper(parser, parsers_lst, crawl_candidate):
            try:
                crawler = Crawler(self.config, self.fetcher)
                article = crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError) as ex:
                if parsers_lst:
                    parser = parsers_lst.pop(0)  # remove it also!
                    return crawler_wrapper(parser, parsers_lst,
                                           crawl_candidate)
                else:
                    raise ex
            return article

        # use the wrapper
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        return crawler_wrapper(self.config.parser_class, parsers,
                               crawl_candidate)

    def __fetch(self, crawl_candidate):
        def crawler_wrapper(parser, parsers_lst, crawl_candidate):
            try:
                crawler = CatalogueCrawler(self.config, self.fetcher)
                article = crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError) as ex:
                if parsers_lst:
                    parser = parsers_lst.pop(0)  # remove it also!
                    return crawler_wrapper(parser, parsers_lst,
                                           crawl_candidate)
                else:
                    raise ex
            return article

        # use the wrapper
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        return crawler_wrapper(self.config.parser_class, parsers,
                               crawl_candidate)

    def __parse(self, crawl_candidate):
        def crawler_wrapper(parser, parsers_lst, crawl_candidate):
            try:
                crawler = CustomCrawler(self.config, self.fetcher)
                article = crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError) as ex:
                if parsers_lst:
                    parser = parsers_lst.pop(0)  # remove it also!
                    return crawler_wrapper(parser, parsers_lst,
                                           crawl_candidate)
                else:
                    raise ex
            return article

        # use the wrapper
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        return crawler_wrapper(self.config.parser_class, parsers,
                               crawl_candidate)
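
Putting it together: Goose is meant to be used as a context manager, where extract() runs the full Crawler pipeline and returns an Article, fetch() returns a Catalogue via CatalogueCrawler, and parse() applies config.custom_rule via CustomCrawler. A usage sketch, assuming the package is importable as goose3 and that Article exposes read-only properties such as title and cleaned_text backed by the underscore-prefixed fields populated in Example #4:

from goose3 import Goose  # assumed import path

with Goose({'enable_image_fetching': False}) as g:
    article = g.extract(url='https://example.com/some-article')
    print(article.title)         # assumed property backed by _title
    print(article.cleaned_text)  # assumed property backed by _cleaned_text

# close() also runs automatically at garbage collection via weakref.finalize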