Example #1
0
def _iter_metadata_triples(metadata_archive_path):
    """Yields all meta-data of Project Gutenberg texts contained in the catalog
    dump.

    Every archive member whose name ends in ``pg<digits>.rdf`` is parsed into
    a ``Graph`` and its triples are yielded one at a time. A triple is skipped
    (and reported through ``logging.info``) when any of its tokens is a
    ``URIRef`` that contains a space.

    :param metadata_archive_path: Path of the tar archive to read; it is
        opened with ``tarfile.open``.
    :returns: A generator over the valid triples of all matching members.

    """
    # The dot is escaped so that only a literal ".rdf" extension matches; a
    # bare "." would also accept names such as "pg123xrdf". Compiled once so
    # the pattern is not looked up again for every archive member.
    rdf_name_pattern = re.compile(r'^.*pg(?P<etextno>\d+)\.rdf$')

    def is_invalid(token):
        return isinstance(token, URIRef) and ' ' in token

    with tarfile.open(metadata_archive_path) as metadata_archive:
        for item in metadata_archive:
            if not rdf_name_pattern.match(item.name):
                continue
            # Extraction and parsing both run inside disable_logging(), as in
            # the original code.
            with disable_logging():
                extracted = metadata_archive.extractfile(item)
                if extracted is None:
                    # extractfile() returns None for members that are neither
                    # regular files nor links (e.g. directories): nothing to
                    # parse.
                    continue
                graph = Graph().parse(extracted)
            for fact in graph:
                if any(is_invalid(token) for token in fact):
                    logging.info('skipping invalid triple %s', fact)
                else:
                    yield fact
Example #2
0
def _iter_metadata_triples(metadata_archive_path):
    """Yields all meta-data of Project Gutenberg texts contained in the catalog
    dump.

    Archive members are selected by name: only those ending in
    ``pg<digits>.rdf`` are parsed into a ``Graph``. The triples of each graph
    are yielded individually, except for triples in which some token is a
    ``URIRef`` containing a space; those are logged at INFO level and dropped.

    :param metadata_archive_path: Path of the tar archive to read; it is
        opened with ``tarfile.open``.
    :returns: A generator over the valid triples of all matching members.

    """
    # Escaping the dot restricts the match to a real ".rdf" suffix; the
    # unescaped form also matched names like "pg123xrdf". The pattern is
    # built once, outside the member loop.
    rdf_member_regex = re.compile(r'^.*pg(?P<etextno>\d+)\.rdf$')

    def is_invalid(token):
        return isinstance(token, URIRef) and ' ' in token

    with tarfile.open(metadata_archive_path) as metadata_archive:
        for item in metadata_archive:
            if rdf_member_regex.match(item.name) is None:
                continue
            # Extraction and parsing stay inside disable_logging(), exactly
            # where the original code performed them.
            with disable_logging():
                extracted = metadata_archive.extractfile(item)
                if extracted is None:
                    # Members that are not regular files or links have no
                    # content stream, so there is nothing to hand to parse().
                    continue
                graph = Graph().parse(extracted)
            for fact in graph:
                if any(is_invalid(token) for token in fact):
                    logging.info('skipping invalid triple %s', fact)
                else:
                    yield fact
Example #3
0
    def _iter_metadata_triples(cls, metadata_archive_path):
        """Yields all meta-data of Project Gutenberg texts contained in the
        catalog dump.

        Every archive member whose name ends in ``pg<digits>.rdf`` is parsed
        into a ``Graph`` and its triples are yielded one at a time. Triples
        rejected by ``cls._metadata_is_invalid`` are reported through
        ``logging.info`` and skipped.

        :param metadata_archive_path: Path of the tar archive to read; it is
            opened with ``tarfile.open``.
        :returns: A generator over the valid triples of all matching members.

        """
        # The dot is escaped so that only a literal ".rdf" extension matches;
        # a bare "." would also accept names such as "pg123xrdf".
        pg_rdf_regex = re.compile(r'pg\d+\.rdf$')
        with closing(tarfile.open(metadata_archive_path)) as metadata_archive:
            for item in metadata_archive:
                if not pg_rdf_regex.search(item.name):
                    continue
                with disable_logging():
                    extracted = metadata_archive.extractfile(item)
                    if extracted is None:
                        # extractfile() returns None for members that are
                        # neither regular files nor links (e.g. directories):
                        # nothing to parse.
                        continue
                    graph = Graph().parse(extracted)
                for fact in graph:
                    if cls._metadata_is_invalid(fact):
                        logging.info('skipping invalid triple %s', fact)
                    else:
                        yield fact
Example #4
0
    def _iter_metadata_triples(cls, metadata_archive_path):
        """Yields all meta-data of Project Gutenberg texts contained in the
        catalog dump.

        Archive members are selected by name: only those ending in
        ``pg<digits>.rdf`` are parsed into a ``Graph``. Each triple of the
        graph is passed to ``cls._metadata_is_invalid``; rejected triples are
        logged at INFO level and dropped, all others are yielded.

        :param metadata_archive_path: Path of the tar archive to read; it is
            opened with ``tarfile.open``.
        :returns: A generator over the valid triples of all matching members.

        """
        # Escaping the dot restricts the match to a real ".rdf" suffix; the
        # unescaped form also matched names like "pg123xrdf".
        pg_rdf_regex = re.compile(r'pg\d+\.rdf$')
        with closing(tarfile.open(metadata_archive_path)) as metadata_archive:
            for item in metadata_archive:
                if pg_rdf_regex.search(item.name) is None:
                    continue
                with disable_logging():
                    extracted = metadata_archive.extractfile(item)
                    if extracted is None:
                        # Members that are not regular files or links have no
                        # content stream, so there is nothing to parse.
                        continue
                    graph = Graph().parse(extracted)
                for fact in graph:
                    if cls._metadata_is_invalid(fact):
                        logging.info('skipping invalid triple %s', fact)
                    else:
                        yield fact