def filter_record(self, warc_record, article=None):
    """Keep only state-broadcaster articles, in a European language, that mention COVID-19.

    :param warc_record: warcio record; its WARC-Target-URI header is inspected
    :param article: optionally a pre-extracted article; extracted lazily otherwise
    :return: tuple of (passed: bool, article) — article may be None on early rejection
    """
    target_uri = warc_record.rec_headers.get_header('WARC-Target-URI')
    registered = tldextract.extract(target_uri).registered_domain
    if registered not in STATE_BROADCASTERS:
        return False, article
    country = STATE_BROADCASTERS[registered]

    # delegate the generic checks (hosts/dates/...) to the parent filter first
    passed, article = super().filter_record(warc_record, article)
    if not passed:
        return False, article

    if article is None:
        article = NewsPlease.from_warc(warc_record)
    article.country = country

    language = article.language
    if not language or not is_european_langcode(language):
        return False, article

    searcher = get_covid_searchers().get(language)
    if searcher is None:
        # no COVID keyword searcher available for this language
        return False, article

    def has_covid_term(field):
        # lowercase + utf-8 encode because the searcher matches on bytes
        text = (getattr(article, field) or "").lower()
        return searcher.match(text.encode("utf-8"))

    if has_covid_term("title") or has_covid_term("maintext"):
        return True, article
    return False, article
def filter_record(self, warc_record, article=None):
    """Keep only European-country articles, in a European language, that mention COVID-19.

    :param warc_record: warcio record; its WARC-Target-URI header is inspected
    :param article: optionally a pre-extracted article; extracted lazily otherwise
    :return: tuple of (passed: bool, article) — article may be None on early rejection
    """
    passed_filters, article = super().filter_record(warc_record, article)
    if not passed_filters:
        return False, article

    url = warc_record.rec_headers.get_header('WARC-Target-URI')

    def get_lang():
        # lazy extraction: only pay for NewsPlease parsing if the country
        # detector actually needs the article language
        nonlocal article
        if article is None:
            article = NewsPlease.from_warc(warc_record)
        return article.language

    country = detect_country(url, get_lang)
    if not country or not is_european_cc(country):
        return False, article

    # FIX: ensure the article exists BEFORE assigning to it — detect_country may
    # have decided from the URL alone without ever calling get_lang(), in which
    # case article was still None and `article.country = ...` raised AttributeError
    if article is None:
        article = NewsPlease.from_warc(warc_record)
    article.country = country

    lang = article.language
    if not lang or not is_european_langcode(lang):
        return False, article

    searcher = get_covid_searchers().get(lang)
    if searcher is None:
        return False, article

    def match(key):
        return searcher.match((getattr(article, key) or "").lower().encode("utf-8"))

    if match("title"):
        return True, article
    if match("maintext"):
        return True, article
    # FIX: was `return True, article`, which made the COVID-mention check a no-op
    # (every article passed regardless of content); reject when neither field matches
    return False, article
def filter_record(self, warc_record, article=None):
    """Keep only records whose canonicalized URL is in the precomputed `all_urls` set.

    :param warc_record: warcio record; its WARC-Target-URI header is inspected
    :param article: optionally a pre-extracted article; extracted lazily otherwise
    :return: tuple of (passed: bool, article) — article may be None on rejection
    """
    passed_filters, article = super().filter_record(warc_record, article)
    # FIX: the parent filter's verdict was computed but never checked, so
    # records rejected by the superclass were still accepted here
    if not passed_filters:
        return False, article

    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    canon_url = canonicalize_url(url)
    if canon_url not in all_urls:
        return False, article

    if article is None:
        article = NewsPlease.from_warc(warc_record)
    return True, article
def __filter_record(self, warc_record, article=None):
    """
    Returns true if a record passes all tests: hosts, publishing date, description text

    :param warc_record: warcio record; its WARC-Target-URI header is inspected
    :param article: optionally a pre-extracted article; extracted lazily otherwise
    :return: A tuple of (True or False) and an article (might be None)
    """
    # filter by host if list is populated - an empty host list makes the process
    # extremely slow; seems like it is caused by the date checks
    if self.filter_valid_hosts:
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        # very simple check: is one of the required host names contained in the URL.
        # Better would be to extract the host name from the Target URI and compare
        # for equality, because something like g.co?forward_url=facebook.com
        # currently yields a positive test for facebook.com even though the host is g.co
        # FIX: use for/else instead of the manual length/counter bookkeeping —
        # the else clause runs exactly when no host matched (no break)
        for valid_host in self.filter_valid_hosts:
            if valid_host in url:
                break
        else:
            return False, article

    # filter by date
    if self.filter_start_date or self.filter_end_date:
        if not article:
            article = NewsPlease.from_warc(warc_record)
        publishing_date = self.__get_publishing_date(article)
        if not publishing_date:
            if self.filter_strict_date:
                return False, article
        else:
            # here we for sure have a date
            # is article published too early?
            if self.filter_start_date and publishing_date < self.filter_start_date:
                return False, article
            # FIX: guard against filter_end_date being None — the unguarded
            # comparison raised TypeError whenever only filter_start_date was set
            if self.filter_end_date and publishing_date > self.filter_end_date:
                return False, article

    # filter by description text
    # FIX: the article may still be None here when no date filter is configured;
    # extract it before reading its description data
    if not article:
        article = NewsPlease.from_warc(warc_record)
    desc_data = self.__get_description_data(article)
    if not desc_data:
        return False, article
    if self.filter_text not in desc_data:
        return False, article
    return True, article
def __process_warc_gz_file(self, path_name):
    """
    Iterates all transactions in one WARC file and for each transaction tries
    to extract an article object. Afterwards, each article is checked against
    the filter criteria and if all are passed, the function
    on_valid_article_extracted is invoked with the article object.
    :param path_name: path of the WARC (gz) file to iterate
    :return: None
    """
    counter_article_total = 0
    counter_article_passed = 0
    counter_article_discarded = 0
    start_time = time.time()
    with open(path_name, 'rb') as stream:
        for record in ArchiveIterator(stream):
            try:
                if record.rec_type == 'response':
                    counter_article_total += 1
                    # if the article passes filter tests, we notify the user
                    filter_pass, article = self.__filter_record(record)
                    if filter_pass:
                        counter_article_passed += 1
                        if not article:
                            article = NewsPlease.from_warc(record)
                        # FIX: news-please articles expose source_domain /
                        # date_publish; sourceDomain / publish_date raised
                        # AttributeError (see the sibling implementation below)
                        self.logger.info('article pass (%s; %s; %s)',
                                         article.source_domain,
                                         article.date_publish, article.title)
                        self.on_valid_article_extracted(article)
                    else:
                        counter_article_discarded += 1
                        if article:
                            self.logger.info('article discard (%s; %s; %s)',
                                             article.source_domain,
                                             article.date_publish, article.title)
                        else:
                            self.logger.info(
                                'article discard (%s)',
                                record.rec_headers.get_header('WARC-Target-URI'))
                    if counter_article_total % 10 == 0:
                        elapsed_secs = time.time() - start_time
                        secs_per_article = elapsed_secs / counter_article_total
                        self.logger.info('statistics')
                        self.logger.info('pass = %i, discard = %i, total = %i',
                                         counter_article_passed,
                                         counter_article_discarded,
                                         counter_article_total)
                        self.logger.info(
                            'extraction from current WARC file started %s; %f s/article',
                            human(start_time), secs_per_article)
            # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit
            except Exception:
                if self.continue_after_error:
                    self.logger.error('Unexpected error: %s', sys.exc_info()[0])
                else:
                    raise
def process_warc_record(record):
    """Extract an article from a WARC response record and serialize it to compact JSON.

    Returns None for non-response records, failed extractions, or on any
    exception (logged as a warning; the record is skipped).
    """
    try:
        if record.rec_type != 'response':
            return None
        article = NewsPlease.from_warc(record)
        if article is None:
            return None
        # default=str stringifies non-JSON types (e.g. datetimes) on purpose
        return json.dumps(article.__dict__, default=str, separators=(',', ':'))
    except Exception as e:
        logging.getLogger().warning('skipping record due to Exception: ' + str(e))
        return None
def process_warc_file(self, path_name):
    """
    Iterates all transactions in one WARC file and for each transaction tries
    to extract an article object. Afterwards, each article is checked against
    the filter criteria and if all are passed, the function save_article is
    invoked with the article object.
    :param path_name: path of the WARC file to iterate
    :return: None
    """
    total = 0
    passed = 0
    discarded = 0
    # NOTE(review): `error` is reported in the statistics line but never
    # incremented anywhere in this loop — confirm intended semantics
    error = 0
    start_time = time.time()
    with open(path_name, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == "warcinfo":
                logger.info(record.raw_stream.read())
                continue
            elif record.rec_type != "response":
                logger.warning("WARC-Type: is not response")
                continue
            total += 1
            # if the article passes filter tests, we notify the user
            if self.is_wanted_record(record):
                passed += 1
                article = NewsPlease.from_warc(record)
                self.process_article(article)
            else:
                discarded += 1
                # FIX: removed the stray ')' that unbalanced the log message
                logger.debug(
                    "article discard: %s",
                    record.rec_headers.get_header("WARC-Target-URI"),
                )
            if total % 100 == 0:
                logger.info(
                    "pass = %i, discard = %i, error = %i, total = %i",
                    passed,
                    discarded,
                    error,
                    total,
                )
                secs_per_article = (time.time() - start_time) / total
                # FIX: lazy %-args instead of an f-string so formatting is
                # skipped when the log level filters the message out
                logger.info("extracting WARC %s s/article", secs_per_article)
    self.downloaded_urls.append(self.url)
def __filter_record(self, warc_record, article=None):
    """
    Returns true if a record passes all tests: hosts, publishing date
    :param warc_record:
    :return: A tuple of (True or False) and an article (might be None)
    """
    # host filter: substring match of any configured host against the target URI.
    # A stricter implementation would extract the host name from the Target URI
    # and compare for equality — g.co?forward_url=facebook.com currently counts
    # as a match for facebook.com even though the actual host is g.co.
    if self.__filter_valid_hosts:
        target_uri = warc_record.rec_headers.get_header('WARC-Target-URI')
        if not any(host in target_uri for host in self.__filter_valid_hosts):
            return False, article

    # date filter: only extract the article when some date bound is configured
    if self.__filter_start_date or self.__filter_end_date:
        if not article:
            article = NewsPlease.from_warc(warc_record)
        published = self.__get_publishing_date(warc_record, article)
        if not published:
            # no date found — reject only in strict mode
            if self.__filter_strict_date:
                return False, article
        else:
            # a date is available: enforce the configured bounds
            if self.__filter_start_date and published < self.__filter_start_date:
                return False, article
            if self.__filter_end_date and published > self.__filter_end_date:
                return False, article

    return True, article
def __process_warc_gz_file(self, path_name):
    """
    Iterates all transactions in one WARC file and for each transaction tries
    to extract an article object. Afterwards, each article is checked against
    the filter criteria and if all are passed, the function
    on_valid_article_extracted is invoked with the article object.
    :param path_name: path of the WARC (gz) file to iterate
    :return: None
    """
    counter_article_total = 0
    counter_article_passed = 0
    counter_article_discarded = 0
    start_time = time.time()
    # opens a file and returns a stream; 'rb' = read/binary
    with open(path_name, 'rb') as stream:
        for record in ArchiveIterator(stream):
            try:
                # Every WARC record shall have a type, reported in the WARC-Type
                # field. There are eight WARC record types: 'warcinfo',
                # 'response', 'resource', 'request', 'metadata', 'revisit',
                # 'conversion', and 'continuation'.
                if record.rec_type == 'response':
                    counter_article_total += 1
                    # if the article passes filter tests, we notify the user;
                    # the filter returns (True/False, article-or-None)
                    filter_pass, article = self.__filter_record(record)
                    if filter_pass:
                        counter_article_passed += 1
                        if not article:
                            article = NewsPlease.from_warc(record)
                        self.logger.info('article pass (%s; %s; %s)',
                                         article.source_domain,
                                         article.date_publish, article.title)
                        self.on_valid_article_extracted(article)
                    else:
                        counter_article_discarded += 1
                        if article:
                            self.logger.info(
                                'article discard (%s; %s; %s)',
                                article.source_domain,
                                article.date_publish, article.title)
                        else:
                            self.logger.info(
                                'article discard (%s)',
                                record.rec_headers.get_header(
                                    'WARC-Target-URI'))
                    if counter_article_total % 10 == 0:
                        elapsed_secs = time.time() - start_time
                        secs_per_article = elapsed_secs / counter_article_total
                        self.logger.info('statistics')
                        # FIX: routine statistics were logged at warning level
                        # while the 'statistics' header used info — use info
                        # consistently for both
                        self.logger.info(
                            'pass = %i, discard = %i, total = %i',
                            counter_article_passed,
                            counter_article_discarded,
                            counter_article_total)
                        self.logger.info(
                            'extraction from current WARC file started %s; %f s/article',
                            human(start_time), secs_per_article)
            # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit
            except Exception:
                if self.continue_after_error:
                    self.logger.error('Unexpected error: %s', sys.exc_info()[0])
                else:
                    raise
def get_lang(): nonlocal article if article is None: article = NewsPlease.from_warc(warc_record) return article.language