Example #1
0
def _reset_metadata_graph():
    """Forgets the module-level metadata database singleton and removes the
    persistent RDF graph stored at ``_METADATA_CACHE``.

    """
    global _METADATA_DATABASE_SINGLETON
    # Drop the in-memory handle first, then delete what is persisted.
    _METADATA_DATABASE_SINGLETON = None
    remove(_METADATA_CACHE)
Example #2
0
def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    Args:
        etextno: identifier of the text; normalised via ``validate_etextno``.
        refresh_cache: when true, the cached copy is removed first so the text
            is downloaded again.
        mirror: forwarded unchanged to ``_format_download_uri``.
        prefer_ascii: forwarded unchanged to ``_format_download_uri``.

    Returns:
        The text decoded from the gzip-compressed, UTF-8 encoded cache file.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)
    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
        response = requests.get(download_uri)
        # Ensure proper UTF-8 saving. There might be instances of ebooks or
        # mirrors which advertise a broken encoding, and this will break
        # downstream usages. For example, #55517 from aleph.gutenberg.org:
        #
        # from gutenberg.acquire import load_etext
        # print(load_etext(55517, refresh_cache=True)[0:1000])
        #
        # response.encoding will be 'ISO-8859-1' while the file is UTF-8.
        #
        # ``apparent_encoding`` re-runs character detection over the whole
        # body on every access, so read it once and reuse the result.
        detected_encoding = response.apparent_encoding
        if response.encoding != detected_encoding:
            response.encoding = detected_encoding
        text = response.text
        with closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))

    # Always serve the result from the cache file, even right after download.
    with closing(gzip.open(cached, 'r')) as cache:
        text = cache.read().decode('utf-8')
    return text
Example #3
0
def _reset_metadata_graph():
    """Forgets the module-level metadata database singleton and removes the
    persistent RDF graph stored at ``_METADATA_CACHE``.

    """
    global _METADATA_DATABASE_SINGLETON
    # Drop the in-memory handle first, then delete what is persisted.
    _METADATA_DATABASE_SINGLETON = None
    remove(_METADATA_CACHE)
Example #4
0
def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    Args:
        etextno: identifier of the text; normalised via ``validate_etextno``.
        refresh_cache: when true, the cached copy is removed first so the text
            is downloaded again.
        mirror: forwarded unchanged to ``_format_download_uri``.
        prefer_ascii: forwarded unchanged to ``_format_download_uri``.

    Returns:
        The text decoded from the gzip-compressed, UTF-8 encoded cache file.
    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)
    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
        response = requests.get(download_uri)
        # Ensure proper UTF-8 saving. There might be instances of ebooks or
        # mirrors which advertise a broken encoding, and this will break
        # downstream usages. For example, #55517 from aleph.gutenberg.org:
        #
        # from gutenberg.acquire import load_etext
        # print(load_etext(55517, refresh_cache=True)[0:1000])
        #
        # response.encoding will be 'ISO-8859-1' while the file is UTF-8.
        #
        # ``apparent_encoding`` re-runs character detection over the whole
        # body on every access, so read it once and reuse the result.
        detected_encoding = response.apparent_encoding
        if response.encoding != detected_encoding:
            response.encoding = detected_encoding
        text = response.text
        with closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))

    # Always serve the result from the cache file, even right after download.
    with closing(gzip.open(cached, 'r')) as cache:
        text = cache.read().decode('utf-8')
    return text


# def _main():
#     """Command line interface to the module.
#     """
#     from argparse import ArgumentParser, FileType
#     from gutenberg import Error
#     from gutenberg._util.os import reopen_encoded

#     parser = ArgumentParser(description='Download a Project Gutenberg text')
#     parser.add_argument('etextno', type=int)
#     parser.add_argument('outfile', type=FileType('w'))
#     parser.add_argument('--mirror', '-m', type=str, default=None)
#     parser.add_argument('--prefer-ascii', '-a', type=bool, default=False)
#     args = parser.parse_args()

#     try:
#         text = load_etext(args.etextno,
#                           mirror=args.mirror,
#                           prefer_ascii=args.prefer_ascii)
#         with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
#             outfile.write(text)
#     except Error as error:
#         parser.error(str(error))

# if __name__ == '__main__':
#     _main()
Example #5
0
    def _download_metadata_archive(self):
        """Makes a remote call to the Project Gutenberg servers and downloads
        the entire Project Gutenberg meta-data catalog. The catalog describes
        the texts on Project Gutenberg in RDF.

        The catalog is fetched from ``self.catalog_source`` into a temporary
        file. The function yields the path of that file (not an open
        file-pointer) and removes the file once the consumer is done with it,
        including when the download or the consumer raises.

        NOTE(review): the ``yield`` suggests this is wrapped by
        ``contextlib.contextmanager``; the decorator is not visible here.

        """
        from contextlib import closing

        # delete=False: the file must outlive the ``with`` below so the
        # consumer can reopen it by name; it is removed explicitly instead.
        metadata_archive = tempfile.NamedTemporaryFile(delete=False)
        try:
            with metadata_archive:
                # Close the HTTP response as soon as the copy is finished.
                with closing(urlopen(self.catalog_source)) as response:
                    shutil.copyfileobj(response, metadata_archive)
            yield metadata_archive.name
        finally:
            remove(metadata_archive.name)
Example #6
0
    def _download_metadata_archive(self):
        """Makes a remote call to the Project Gutenberg servers and downloads
        the entire Project Gutenberg meta-data catalog. The catalog describes
        the texts on Project Gutenberg in RDF.

        The catalog is fetched from ``self.catalog_source`` into a temporary
        file. The function yields the path of that file (not an open
        file-pointer) and removes the file once the consumer is done with it,
        including when the download or the consumer raises.

        NOTE(review): the ``yield`` suggests this is wrapped by
        ``contextlib.contextmanager``; the decorator is not visible here.

        """
        from contextlib import closing

        # delete=False: the file must outlive the ``with`` below so the
        # consumer can reopen it by name; it is removed explicitly instead.
        metadata_archive = tempfile.NamedTemporaryFile(delete=False)
        try:
            with metadata_archive:
                # Close the HTTP response as soon as the copy is finished.
                with closing(urlopen(self.catalog_source)) as response:
                    shutil.copyfileobj(response, metadata_archive)
            yield metadata_archive.name
        finally:
            remove(metadata_archive.name)
Example #7
0
def _download_metadata_archive():
    """Makes a remote call to the Project Gutenberg servers and downloads the
    entire Project Gutenberg meta-data catalog. The catalog describes the texts
    on Project Gutenberg in RDF.

    The catalog is fetched into a temporary file. The function yields the path
    of that file (not an open file-pointer) and removes the file once the
    consumer is done with it, including when the download or the consumer
    raises.

    NOTE(review): the ``yield`` suggests this is wrapped by
    ``contextlib.contextmanager``; the decorator is not visible here.

    """
    from contextlib import closing

    data_url = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    # delete=False: the file must outlive the ``with`` below so the consumer
    # can reopen it by name; it is removed explicitly instead.
    metadata_archive = tempfile.NamedTemporaryFile(delete=False)
    try:
        with metadata_archive:
            # Close the HTTP response as soon as the copy is finished.
            with closing(urllib2.urlopen(data_url)) as response:
                shutil.copyfileobj(response, metadata_archive)
        yield metadata_archive.name
    finally:
        remove(metadata_archive.name)
Example #8
0
def _download_metadata_archive():
    """Makes a remote call to the Project Gutenberg servers and downloads the
    entire Project Gutenberg meta-data catalog. The catalog describes the texts
    on Project Gutenberg in RDF.

    The catalog is fetched into a temporary file. The function yields the path
    of that file (not an open file-pointer) and removes the file once the
    consumer is done with it, including when the download or the consumer
    raises.

    NOTE(review): the ``yield`` suggests this is wrapped by
    ``contextlib.contextmanager``; the decorator is not visible here.

    """
    from contextlib import closing

    data_url = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    # delete=False: the file must outlive the ``with`` below so the consumer
    # can reopen it by name; it is removed explicitly instead.
    metadata_archive = tempfile.NamedTemporaryFile(delete=False)
    try:
        with metadata_archive:
            # Close the HTTP response as soon as the copy is finished.
            with closing(urllib2.urlopen(data_url)) as response:
                shutil.copyfileobj(response, metadata_archive)
        yield metadata_archive.name
    finally:
        remove(metadata_archive.name)
Example #9
0
def load_etext(etextno, refresh_cache=False):
    """Fetches the full body of a Project Gutenberg text as unicode.

    The first request for a text downloads it and stores it gzip-compressed
    in the local text cache; the returned value is always read back from that
    cache file. Passing ``refresh_cache=True`` removes the cached copy first.

    """
    etextno = validate_etextno(etextno)
    cache_name = '{0}.txt.gz'.format(etextno)
    cache_path = os.path.join(_TEXT_CACHE, cache_name)

    if refresh_cache:
        remove(cache_path)

    if not os.path.exists(cache_path):
        makedirs(os.path.dirname(cache_path))
        body = requests.get(_format_download_uri(etextno)).text
        with closing(gzip.open(cache_path, 'w')) as archive:
            archive.write(body.encode('utf-8'))

    # Serve the result from the cache file, even right after a download.
    with closing(gzip.open(cache_path, 'r')) as archive:
        return archive.read().decode('utf-8')
Example #10
0
def load_etext(etextno, refresh_cache=False):
    """Fetches the full body of a Project Gutenberg text as unicode.

    A cached copy is returned when one exists. Otherwise the text is
    downloaded, decoded as UTF-8, stored gzip-compressed in the local text
    cache and returned. Passing ``refresh_cache=True`` removes the cached
    copy first.

    """
    etextno = validate_etextno(etextno)
    cache_path = os.path.join(_TEXT_CACHE, '{0}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cache_path)

    # Cache hit: nothing to download.
    if os.path.exists(cache_path):
        with contextlib.closing(gzip.open(cache_path, 'r')) as archive:
            return archive.read().decode('utf-8')

    # Cache miss: download, persist, and hand back the downloaded text.
    makedirs(os.path.dirname(cache_path))
    reply = requests.get(_format_download_uri(etextno))
    reply.encoding = 'utf-8'
    body = reply.text
    with contextlib.closing(gzip.open(cache_path, 'w')) as archive:
        archive.write(body.encode('utf-8'))
    return body
Example #11
0
    def delete(self):
        """Close the cache, then remove its local storage path.

        """
        # Close before removing the storage path.
        self.close()
        remove(self._local_storage_path)
Example #12
0
 def tearDown(self):
     """Best-effort removal of the temporary path used by the test."""
     try:
         remove(self.temporary_path)
     except OSError:
         # Cleanup only: an OSError raised by the removal is ignored.
         pass
Example #13
0
    def delete(self):
        """Close the cache, then remove its local storage path.

        """
        # Close before removing the storage path.
        self.close()
        remove(self._local_storage_path)
Example #14
0
 def tearDown(self):
     """Remove the temporary path used by the test."""
     remove(self.temporary_path)
Example #15
0
 def tearDown(self):
     """Best-effort removal of the temporary path used by the test."""
     try:
         remove(self.temporary_path)
     except OSError:
         # Cleanup only: an OSError raised by the removal is ignored.
         pass
Example #16
0
 def test_remove(self):
     """``remove`` deletes both the temporary file and the directory."""
     targets = (self.temporary_file, self.temporary_directory)
     for target in targets:
         # Each target must exist beforehand and be gone afterwards.
         self.assertTrue(os.path.exists(target))
         remove(target)
         self.assertFalse(os.path.exists(target))
Example #17
0
 def test_remove(self):
     """``remove`` deletes both the temporary file and the directory."""
     targets = (self.temporary_file, self.temporary_directory)
     for target in targets:
         # Each target must exist beforehand and be gone afterwards.
         self.assertTrue(os.path.exists(target))
         remove(target)
         self.assertFalse(os.path.exists(target))