def _iter_metadata_triples(metadata_archive_path): """Yields all meta-data of Project Gutenberg texts contained in the catalog dump. """ is_invalid = lambda token: isinstance(token, URIRef) and ' ' in token with tarfile.open(metadata_archive_path) as metadata_archive: for item in metadata_archive: if re.match(r'^.*pg(?P<etextno>\d+).rdf$', item.name): with disable_logging(): graph = Graph().parse(metadata_archive.extractfile(item)) for fact in graph: if not any(is_invalid(token) for token in fact): yield fact else: logging.info('skipping invalid triple %s', fact)
def _iter_metadata_triples(cls, metadata_archive_path): """Yields all meta-data of Project Gutenberg texts contained in the catalog dump. """ pg_rdf_regex = re.compile(r'pg\d+.rdf$') with closing(tarfile.open(metadata_archive_path)) as metadata_archive: for item in metadata_archive: if pg_rdf_regex.search(item.name): with disable_logging(): extracted = metadata_archive.extractfile(item) graph = Graph().parse(extracted) for fact in graph: if cls._metadata_is_invalid(fact): logging.info('skipping invalid triple %s', fact) else: yield fact