def __init__(self, config):
    # config
    self.config = config
    # parser
    self.parser = self.config.get_parser()
    # article
    self.article = Article()
    # init the extractor
    self.extractor = self.get_extractor()
    # init the document cleaner
    self.cleaner = self.get_cleaner()
    # init the output formatter
    self.formatter = self.get_formatter()
    # metas extractor
    self.metas_extractor = self.get_metas_extractor()
    # publishdate extractor
    self.publishdate_extractor = self.get_publishdate_extractor()
    # opengraph extractor
    self.opengraph_extractor = self.get_opengraph_extractor()
    # tags extractor
    self.tags_extractor = self.get_tags_extractor()
    # authors extractor
    self.authors_extractor = self.get_authors_extractor()
    # tweets extractor
    self.tweets_extractor = self.get_tweets_extractor()
    # links extractor
    self.links_extractor = self.get_links_extractor()
    # video extractor
    self.video_extractor = self.get_video_extractor()
    # title extractor
    self.title_extractor = self.get_title_extractor()
    # html fetcher
    self.fetcher = NetworkFetcher(self.config)
    # image extractor
    self.image_extractor = self.get_image_extractor()
    # TODO: log prefix
    self.logPrefix = "crawler:"
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):
        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        return self.process(raw_html, parse_candidate.url, parse_candidate.link_hash)

    def process(self, raw_html, final_url, link_hash):
        # create document
        doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']

        # publishdate
        self.article._publish_date = self.publishdate_extractor.extract()
        if self.article.publish_date:
            try:
                publish_datetime = dateutil.parser.parse(self.article.publish_date)
                if publish_datetime.tzinfo:
                    self.article._publish_datetime_utc = publish_datetime.astimezone(tzutc())
                else:
                    self.article._publish_datetime_utc = publish_datetime
            except (ValueError, OverflowError):
                self.article._publish_datetime_utc = None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        # check for a known node as the content body; if we find one, force article.doc
        # to be the found node so the cleaner does not remove wanted text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc = [self.cleaner.clean(doc)]
        else:
            doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.calculate_best_node(doc)

        # if we do not find an article within the discovered possible article nodes,
        # try again with the root node.
        if self.article._top_node is None:
            # try again with the root node.
            self.article._top_node = self.extractor.calculate_best_node(self.article._doc)
        else:
            # set the doc member to the discovered article node.
            self.article._doc = doc

        # if we have a top node, let's process it
        if self.article._top_node is not None:
            # article links
            self.article._links = self.links_extractor.extract()
            # tweets
            self.article._tweets = self.tweets_extractor.extract()
            # video handling
            self.article._movies = self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            self.article._top_node = self.extractor.post_cleanup()
            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url, crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        # requests has a good idea; use what it says
        if response.encoding != 'ISO-8859-1':
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path, '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
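# A minimal sketch of driving the Crawler above directly, mirroring how the Goose
# class below wires it up; assumes goose3's Configuration, NetworkFetcher and
# CrawlCandidate (all referenced elsewhere in this module) are importable.
def crawl_url(url):
    config = Configuration()
    fetcher = NetworkFetcher(config)
    candidate = CrawlCandidate(config, url, raw_html=None)
    return Crawler(config, fetcher).crawl(candidate)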
class Goose(object):
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # we don't need to go further if image fetching or local_storage is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # test if config.local_storage_path is a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = ('{} directory does not seem to exist, you need to set this for '
                   'image processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check if it is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = ('{} directory is not writable, you need to set this for image '
                   'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        ''' close the connection and do any other required cleanup '''
        if self.fetcher is not None:
            self.shutdown_network()
        self.finalizer.atexit = False  # turn off the garbage collection close

    def extract(self, url=None, raw_html=None):
        ''' Main method to extract an article object from a URL;
            pass in a url and get back an Article '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(crawl_candidate)

    def shutdown_network(self):
        ''' ensure the connection is closed '''
        self.fetcher.close()
        self.fetcher = None

    def crawl(self, crawl_candidate):
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        try:
            crawler = Crawler(self.config, self.fetcher)
            article = crawler.crawl(crawl_candidate)
        except (UnicodeDecodeError, ValueError) as ex:
            if parsers:
                self.config.parser_class = parsers[0]
                return self.crawl(crawl_candidate)
            else:
                raise ex
        return article
class Goose(object):
    ''' Extract the most likely article content and additional metadata from a URL
        or a previously fetched HTML document

    Args:
        config (Configuration, dict): A configuration file or dictionary \
        representation of the configuration file

    Returns:
        Goose: An instance of the goose extraction object
    '''

    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # we don't need to go further if image fetching or local_storage is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # test if config.local_storage_path is a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = ('{} directory does not seem to exist, you need to set this for '
                   'image processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check if it is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = ('{} directory is not writable, you need to set this for image '
                   'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

    def __enter__(self):
        ''' Setup the context manager '''
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        ''' Define what to do when the context manager exits '''
        self.close()

    def close(self):
        ''' Close the network connection and perform any other required cleanup

        Note:
            Auto closed when using goose as a context manager or when garbage collected
        '''
        if self.fetcher is not None:
            self.shutdown_network()
        self.finalizer.atexit = False  # turn off the garbage collection close

    def extract(self, url=None, raw_html=None):
        ''' Extract the most likely article content from the html page

        Args:
            url (str): URL to pull and parse
            raw_html (str): String representation of the HTML page

        Returns:
            Article: Representation of the article contents \
            including other parsed and extracted metadata
        '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.__crawl(crawl_candidate)

    def shutdown_network(self):
        ''' Close the network connection

        Note:
            Auto closed when using goose as a context manager or when garbage collected
        '''
        self.fetcher.close()
        self.fetcher = None

    def __crawl(self, crawl_candidate):
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        try:
            crawler = Crawler(self.config, self.fetcher)
            article = crawler.crawl(crawl_candidate)
        except (UnicodeDecodeError, ValueError) as ex:
            if parsers:
                self.config.parser_class = parsers[0]
                return self.__crawl(crawl_candidate)
            else:
                raise ex
        return article
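# A minimal usage sketch for the Goose facade above; assumes the goose3 package is
# installed and that dict keys map onto Configuration attributes (as the __init__
# above requires). The URL is a placeholder.
if __name__ == "__main__":
    from goose3 import Goose

    with Goose({'enable_image_fetching': False}) as g:
        article = g.extract(url="https://example.com/some-article")
        print(article.title)
        print(article.cleaned_text[:200])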
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):
        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        return self.process(raw_html, parse_candidate.url, parse_candidate.link_hash)

    def process(self, raw_html, final_url, link_hash):
        # create document
        doc = self.get_document(raw_html)

        # article
        self.article.final_url = final_url
        self.article.link_hash = link_hash
        self.article.raw_html = raw_html
        self.article.doc = doc
        self.article.raw_doc = deepcopy(doc)

        # open graph
        self.article.opengraph = self.opengraph_extractor.extract()

        # publishdate
        self.article.publish_date = self.publishdate_extractor.extract()

        # meta
        metas = self.metas_extractor.extract()
        self.article.meta_lang = metas['lang']
        self.article.meta_favicon = metas['favicon']
        self.article.meta_description = metas['description']
        self.article.meta_keywords = metas['keywords']
        self.article.canonical_link = metas['canonical']
        self.article.domain = metas['domain']

        # tags
        self.article.tags = self.tags_extractor.extract()

        # authors
        self.article.authors = self.authors_extractor.extract()

        # title
        self.article.title = self.title_extractor.extract()

        # check for a known node as the content body; if we find one, force article.doc
        # to be the found node so the cleaner does not remove wanted text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            self.article.doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(self.article.doc, list):
            self.article.doc = [self.cleaner.clean(self.article.doc)]
        else:
            self.article.doc = [self.cleaner.clean(deepcopy(x)) for x in self.article.doc]

        # big stuff
        self.article.top_node = self.extractor.calculate_best_node()

        # if we have a top node, let's process it
        if self.article.top_node is not None:
            # article links
            self.article.links = self.links_extractor.extract()
            # tweets
            self.article.tweets = self.tweets_extractor.extract()
            # video handling
            self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            self.article.top_node = self.extractor.post_cleanup()
            # clean_text
            self.article.cleaned_text = self.formatter.get_formatted_text()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url, crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article.top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html
        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path, '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # microdata extractor
        self.microdata_extractor = self.get_microdata_extractor()
        # hCard extractor
        self.hcard_extractor = self.get_hcard_extractor()
        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate, crawl_sub=True):
        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        doc = None
        if crawl_candidate.doc is None:
            # raw html
            raw_html = self.get_html(crawl_candidate, parse_candidate)
            if raw_html is None:
                return self.article
        else:
            doc = crawl_candidate.doc
            raw_html = None
        return self.process(raw_html, parse_candidate.url, parse_candidate.link_hash,
                            doc, crawl_sub)

    def process(self, raw_html, final_url, link_hash, doc=None, crawl_sub=False):
        # create document
        if doc is None:
            doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article.site_domain = goose3.text.get_site_domain(final_url)
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article.doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']
        self.article.metatags = metas['metatags']

        # publishdate
        self.article._publish_date = self.publishdate_extractor.extract()
        if self.article.publish_date:
            try:
                publish_datetime = dateutil.parser.parse(self.article.publish_date)
                if publish_datetime.tzinfo:
                    self.article._publish_datetime_utc = publish_datetime.astimezone(tzutc())
                else:
                    self.article._publish_datetime_utc = publish_datetime
            except (ValueError, OverflowError):
                self.article._publish_datetime_utc = None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # parse json-ld; retry once after fixing a common quoting glitch
        json_ld_tags = self.parser.xpath_re(
            self.article.doc, 'descendant::script[@type="application/ld+json"]')
        if json_ld_tags:
            json_ld_text = self.parser.getText(json_ld_tags[0])
            for i in range(2):
                try:
                    self.article.json_ld = json.loads(json_ld_text)
                except Exception:
                    if i == 0:
                        json_ld_text = json_ld_text.replace('""', '", "')

        # drop nested sub-article nodes from the main document
        for sub_article in self.article.sub_articles:
            if sub_article.node == self.article.doc:
                continue
            self.parser.remove(sub_article.node)
        self.article.doc = self.cleaner.remove_nested_article_tags(self.article.doc)

        # microdata
        self.article.microdata = self.microdata_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        # hcard
        self.article.hcards = self.hcard_extractor.extract()

        self.article.read_more_url = self.links_extractor.extract_read_more()

        # check for a known node as the content body; if we find one, force article.doc
        # to be the found node so the cleaner does not remove wanted text content
        if crawl_sub:
            article_body = self.extractor.get_known_article_tags()
        else:
            article_body = None
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc_nodes = [self.cleaner.clean(doc)]
        else:
            doc_nodes = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.calculate_best_node(doc_nodes)

        # if we do not find an article within the discovered possible article nodes,
        # try again with the root node.
        if self.article._top_node is None:
            # try again with the root node.
            self.article._top_node = self.extractor.calculate_best_node(self.article._doc)
            if self.article.top_node is None:
                self.article._top_node = self.article.doc
        else:
            # set the doc member to the discovered article node.
            self.article.doc = doc[0] if isinstance(doc, list) else doc

        # if we have a top node, let's process it
        if self.article._top_node is not None:
            # article links
            self.article._links = self.links_extractor.extract()
            self.article.html_links = self.links_extractor.extract_html_links()
            # tweets
            self.article._tweets = self.tweets_extractor.extract()
            # video handling
            self.article._movies = self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            if crawl_sub:
                self.article._top_node = self.extractor.post_cleanup()
            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text(
                remove_fewwords=crawl_sub)

        # cleanup tmp file
        self.release_resources()

        # crawl each discovered sub-article separately from its outer html
        if crawl_sub and len(self.article.sub_articles) > 1:
            active_sub_articles = []
            for i in range(len(self.article.sub_articles)):
                sub_article = self.article.sub_articles[i]
                if sub_article.node == self.article.doc:
                    continue
                crawler = Crawler(self.config)
                crawled_article = crawler.crawl(
                    CrawlCandidate(self.config, final_url, raw_html=sub_article.outer_html),
                    crawl_sub=False)
                sub_article.crawled_article = crawled_article
                active_sub_articles.append(sub_article)
            del self.article.sub_articles[:]
            self.article.sub_articles.extend(active_sub_articles)

        if crawl_sub and self.article.sub_articles:
            self.article.sub_articles.sort(key=lambda obj: -len(obj.cleaned_text))
            if not self.article.cleaned_text and self.article.sub_articles[0].crawled_article:
                self.article.cleaned_text = \
                    self.article.sub_articles[0].crawled_article.cleaned_text
            if not self.article.authors:
                self.article.authors = self.article.sub_articles[0].authors

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.doc is not None:
            return SubArticle.get_parsing_candidate(crawl_candidate.doc)
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url, crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        # requests has a good idea; use what it says
        if response.encoding != 'ISO-8859-1':
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings

        if not html:
            html = ""
        crawl_candidate.raw_html = html

        # Twitter/Facebook specific news crawling. Should be transferred to a separate module.
        site_domain = goose3.text.get_site_domain(parsing_candidate.url)
        if site_domain == "twitter.com":
            doc = self.parser.fromstring(html)
            a_links = self.parser.getElementsByTag(doc, tag='a', attr='class',
                                                   value='twitter-timeline-link')
            if a_links:
                parsing_candidate.url = self.parser.getAttribute(a_links[0], 'href')
                html = self.fetcher.fetch(parsing_candidate.url)
                crawl_candidate.raw_html = html
        elif site_domain == "www.facebook.com" and "/posts/" in parsing_candidate.url:
            html = html.replace("<!--", "")
            html = html.replace("-->", "")
            doc = self.parser.fromstring(html)
            a_links = self.parser.xpath_re(doc, "//*[@class='hidden_elem']/descendant::a")
            link_re = re.compile(r"https?://l\.facebook\.com/l\.php\?u=(?P<url>[^&]+)&h")
            for a_link in a_links:
                href = a_link.attrib.get('href')
                match = link_re.search(href)
                if match:
                    url = match.groupdict()["url"]
                    parsing_candidate.url = urllib.parse.unquote(url)
                    html = self.fetcher.fetch(parsing_candidate.url)
                    crawl_candidate.raw_html = html
                    break
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_microdata_extractor(self):
        return MicroDataExtractor(self.config, self.article)

    def get_hcard_extractor(self):
        return HCardExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path, '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
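# A small, self-contained demonstration of the Facebook share-link pattern used in
# get_html() above; the href value is made up for illustration.
import re
import urllib.parse

link_re = re.compile(r"https?://l\.facebook\.com/l\.php\?u=(?P<url>[^&]+)&h")
href = "https://l.facebook.com/l.php?u=https%3A%2F%2Fexample.com%2Fstory&h=AT0xyz"
match = link_re.search(href)
if match:
    print(urllib.parse.unquote(match.groupdict()["url"]))  # -> https://example.com/story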
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):
        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        return self.process(raw_html, parse_candidate.url, parse_candidate.link_hash)

    def process(self, raw_html, final_url, link_hash):
        # create document
        doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        if 'domain' in metas:
            self.article._domain = metas['domain']
        else:
            self.article._domain = up.urlparse(self.article.final_url).netloc

        # publishdate: prefer schema.org, fall back to the extractor
        if 'datePublished' in self.article.schema:
            self.article._publish_date = self.article.schema['datePublished']
        else:
            self.article._publish_date = self.publishdate_extractor.extract()
        if self.article.publish_date:
            try:
                publish_datetime = dateutil.parser.parse(self.article.publish_date)
                if publish_datetime.tzinfo:
                    self.article._publish_datetime_utc = publish_datetime.astimezone(tzutc())
                else:
                    self.article._publish_datetime_utc = publish_datetime
            except (ValueError, OverflowError):
                self.article._publish_datetime_utc = None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors: prefer schema.org author/publisher data, fall back to the extractor
        if 'author' in self.article.schema:
            if isinstance(self.article.schema['author'], list):
                self.article._authors = list(
                    map(lambda entry: entry['name'], self.article.schema['author']))
            elif isinstance(self.article.schema['author'], dict):
                author_string = self.article.schema['author']['name'] \
                    if 'name' in self.article.schema['author'] else ''
                if not author_string:
                    if ('publisher' in self.article.schema
                            and 'name' in self.article.schema['publisher']):
                        self.article._authors = [self.article.schema['publisher']['name']]
                    else:
                        self.article._authors = []
                elif ',' in author_string:
                    self.article._authors = list(map(str.strip, author_string.split(',')))
                elif ' und ' in author_string:
                    self.article._authors = list(map(str.strip, author_string.split(' und ')))
                elif ' and ' in author_string:
                    self.article._authors = list(map(str.strip, author_string.split(' and ')))
                else:
                    self.article._authors = [author_string]
        else:
            self.article._authors = self.authors_extractor.extract()
        self.article._authors = list(map(str.title, self.article.authors))

        # title
        self.article._title = self.title_extractor.extract()

        # prefer an articleBody from structured data; otherwise check for a known node
        # as the content body and, if found, force article.doc to be that node so the
        # cleaner does not remove wanted text content
        if 'articleBody' in self.article.schema:
            self.article._cleaned_text = self.clean_plain_text(
                self.article.schema['articleBody'])
        elif 'articleBody' in metas:
            self.article._cleaned_text = self.clean_plain_text(metas['articleBody'])
        elif 'articleBody' in self.article.opengraph:
            self.article._cleaned_text = self.clean_plain_text(
                self.article.opengraph['articleBody'])
        else:
            (article_body, domain_match) = self.extractor.get_known_article_tags()
            if article_body is not None:
                doc = article_body

            # before we do any calcs on the body itself let's clean up the document
            if not isinstance(doc, list):
                doc = [self.cleaner.clean(doc)]
            else:
                doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

            # get the full text content and set cleaned_text as a fallback
            self.article._cleaned_text = " ".join(self.extractor.get_full_text(doc))

            # otherwise compute the best node
            self.article._top_node = self.extractor.calculate_best_node(doc, domain_match)

            # if we do not find an article within the discovered possible article nodes,
            # try again with the root node.
            if self.article._top_node is None:
                # try again with the root node.
                self.article._top_node = self.extractor.calculate_best_node(
                    self.article._doc, domain_match)
            else:
                # set the doc member to the discovered article node.
                self.article._doc = doc

        # if we have a top node, let's process it
        if self.article._top_node is not None:
            # article links
            self.article._links = self.links_extractor.extract()
            # tweets
            self.article._tweets = self.tweets_extractor.extract()
            # video handling
            self.article._movies = self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            self.article._top_node = self.extractor.post_cleanup()
            # clean_text
            self.article._cleaned_text = self.clean_plain_text(
                self.formatter.get_formatted_text())

        # check for an image in linked data
        if self.config.enable_image_fetching:
            if 'image' in self.article.schema:
                self.article._top_image = self.get_image_extractor().get_image(
                    self.article.schema['image']['url'], extraction_type="Linked Data")

        if self.article.cleaned_text and self.article.cleaned_text != '':
            self.article._meta_lang = langdetect.detect(self.article.cleaned_text)

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url, crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def clean_plain_text(self, raw_text):
        # strip html tags
        res = re.sub("<.*?>", "", raw_text)
        # normalise strange spaces
        res = unicodedata.normalize("NFKC", res)
        # drop soft hyphens
        res = re.sub('[\xc2\xad]', '', res)
        # replace French-style and curly quotation marks
        res = re.sub("[\u00BB\u00AB\u201C\u201D\u201E]", '"', res)
        # replace line feeds
        res = res.replace('\n', ' ')
        # shrink multiple spaces to one
        res = re.sub(r"\s+", " ", res)
        return res

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        # requests has a good idea; use what it says
        if response.encoding != 'ISO-8859-1':
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path, '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
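# A quick illustration of what clean_plain_text() above produces; assumes a Crawler
# built from a default Configuration (the sample string is made up).
if __name__ == "__main__":
    crawler = Crawler(Configuration())
    print(crawler.clean_plain_text('<p>Le \u00abtest\u00bb de\u00admo\nfoo  bar</p>'))
    # -> 'Le "test" demo foo bar'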
class Crawler(object):
    def __init__(self, config: Configuration, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # schema.org news article extractor
        self.schema_extractor = self.get_schema_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)
        # image extractor
        self.image_extractor = self.get_image_extractor()

    def crawl(self, crawl_candidate):
        # parser candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            logger.warning(
                "No raw_html is provided or could be fetched; continuing with an empty Article object"
            )
            return self.article
        return self.process(raw_html, parse_candidate.url, parse_candidate.link_hash)

    def process(self, raw_html: str, final_url: str, link_hash: str) -> Article:
        # create document
        doc = self.get_document(raw_html)

        # article
        self.article._final_url = final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema.org:
        #  - (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        #  - (NewsArticle) https://schema.org/NewsArticle
        #  - (Article) https://schema.org/Article
        self.article._schema = self.schema_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']

        # publishdate
        self.article._publish_date = self.publishdate_extractor.extract()
        self.article._publish_datetime_utc = \
            self._publish_date_to_utc() if self.article.publish_date else None

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        # jump through some hoops to attempt to get a language if none was found
        if self.article._meta_lang is None:
            self.article._meta_lang = self._alternative_language_extractor()

        # check for a known node as the content body; if we find one, force article.doc
        # to be the found node so the cleaner does not remove wanted text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc = [self.cleaner.clean(doc)]
        else:
            doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.calculate_best_node(doc)

        # if we do not find an article within the discovered possible article nodes,
        # try again with the root node.
        if self.article._top_node is None:
            # try again with the root node.
            self.article._top_node = self.extractor.calculate_best_node(self.article._doc)
        else:
            # set the doc member to the discovered article node.
            self.article._doc = doc

        # if we have a top node, let's process it
        if self.article._top_node is not None:
            # article links
            self.article._links = self.links_extractor.extract()
            # tweets
            self.article._tweets = self.tweets_extractor.extract()
            # video handling
            self.article._movies = self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            self.article._top_node = self.extractor.post_cleanup()
            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate: CrawlCandidate) -> ParsingCandidate:
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url, crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate: CrawlCandidate,
                 parsing_candidate: ParsingCandidate) -> str:
        # we got raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            logger.debug(f"Using raw_html for {crawl_candidate}")
            return crawl_candidate.raw_html

        # fetch HTML
        logger.debug(f"Fetching html from {crawl_candidate.url}")
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        # requests has a good idea; use what it says
        if response.encoding != 'ISO-8859-1':
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_schema_extractor(self):
        return SchemaExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def release_resources(self):
        path = os.path.join(self.config.local_storage_path, '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                logger.error(f"File {fname} could not be removed")

    def _publish_date_to_utc(self):
        try:
            publish_datetime = dateutil.parser.parse(self.article.publish_date,
                                                     tzinfos=TIMEZONE_INFO)
            if publish_datetime.tzinfo:
                return publish_datetime.astimezone(tzutc())
            return publish_datetime
        except (ValueError, OverflowError):
            logger.warning(
                f"Publish date {self.article.publish_date} could not be resolved to UTC")
            return None

    def _alternative_language_extractor(self):
        tmp_lang_detect = "{} {} {} {}".format(self.article._meta_description,
                                               self.article._title,
                                               self.article._meta_keywords,
                                               self.article._tags)
        tmp_lang_detect = " ".join(tmp_lang_detect.split())
        if len(tmp_lang_detect) > 15:
            # required to make it deterministic;
            # see: https://github.com/Mimino666/langdetect/blob/master/README.md#basic-usage
            DetectorFactory.seed = 0
            try:
                return detect(tmp_lang_detect)
            except LangDetectException:
                logger.warning(
                    "Alternative language extractor failed to extract a known language")
        return None
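# A minimal sketch of the deterministic language fallback used by
# _alternative_language_extractor() above; assumes the langdetect package.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

def guess_language(text):
    if len(text) <= 15:        # too little text for a reliable guess
        return None
    DetectorFactory.seed = 0   # make langdetect deterministic across runs
    try:
        return detect(text)
    except LangDetectException:
        return None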