def search(title_list=None, authors_list=None):
    """
    Receives two lists of books to download.

    Searches for their availability on Gutenberg and returns a list of
    unique Gutenberg etext IDs, de-duplicated across both queries.
    """
    # Normalise None to empty lists (avoids the mutable-default pitfall).
    if title_list is None:
        title_list = []
    if authors_list is None:
        authors_list = []

    book_list = set()

    # Both lookups have the same shape; iterate over (feature, values)
    # pairs instead of duplicating the loop body per feature.
    for feature, values in (("title", title_list), ("author", authors_list)):
        for value in values:
            found_texts = get_etexts(feature, value)
            if found_texts:
                print(f"Found {value}")
                # get_etexts returns a frozenset of etext ids.
                book_list.update(found_texts)

    return list(book_list)
def trial():
    """Smoke-test the Gutenberg API against Moby Dick (etext #2701)."""
    moby_dick = 2701

    # Full text with the Project Gutenberg boilerplate stripped.
    text = strip_headers(load_etext(moby_dick)).strip()
    print(text)  # 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'

    # Metadata lookups by etext number.
    print(get_metadata('title', moby_dick))   # frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author', moby_dick))  # frozenset([u'Melville, Hermann'])

    # Reverse lookups: etext numbers by metadata value.
    print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # frozenset([2701, ...])
    print(get_etexts('author', 'Melville, Herman'))  # frozenset([2701, ...])
Ejemplo n.º 3
0
    def load_gutenberg(self, language='en'):
        """Download this author's Gutenberg texts into self.books/self.tokens.

        Keeps only texts whose first 'language' metadata value matches
        *language*, loads each distinct title once (titles are compared
        after "\\r\\n" -> space normalisation), and skips books that have
        no downloadable text format, printing a message for each skip.
        """
        texts = get_etexts('author', self.author)
        # Map etext id -> title, filtered to the requested language.
        # NOTE(review): assumes every etext has at least one 'title' and
        # one 'language' metadata value; an empty frozenset would raise
        # IndexError here, exactly as in the original code.
        texts = {
            t: list(get_metadata("title", t))[0]
            for t in texts if list(get_metadata("language", t))[0] == language
        }

        new_texts = dict()
        seen_titles = set()  # normalised titles already loaded (O(1) dedupe)
        for etext_id, title in texts.items():
            title = title.replace("\r\n", " ")
            if title in seen_titles:
                continue
            # Attempt the download first so a failure needs no rollback of
            # the bookkeeping (the original added then removed entries).
            try:
                paragraphs = strip_headers(
                    load_etext(etext_id)).strip().split("\r\n\r\n")
            except UnknownDownloadUriException:
                print(
                    f'Book "{title}" does not have a text format and was not loaded.'
                )
                continue
            seen_titles.add(title)
            new_texts[etext_id] = title
            self.books[title] = paragraphs
            # POS-tag every paragraph of the book.
            self.tokens[title] = [
                nltk.pos_tag(nltk.word_tokenize(paragraph))
                for paragraph in paragraphs
            ]

        texts = new_texts

        print(texts)
def init_books(author_file, json_file):
    """Initialize book list with texts and save it to disk.

    Reads one author name per line from *author_file*, collects every
    English-language etext for each author as a Book(id, title, text),
    and pickles the resulting list to *json_file*.
    """
    with open(author_file) as f:
        authors = [line.strip() for line in f]

    books = []
    for author in authors:
        for etext_id in get_etexts('author', author):
            if list(get_metadata('language', etext_id))[0] != 'en':
                continue
            title = list(get_metadata('title', etext_id))[0]
            try:
                # Only load_etext raises UnknownDownloadUriException;
                # keep the try block narrow so other errors surface.
                etext = strip_headers(load_etext(etext_id)).strip()
            except UnknownDownloadUriException:
                # This book does not have a text format to download.
                continue
            books.append(Book(etext_id, title, etext))

    # NOTE(review): despite the parameter name, the output is a pickle,
    # not JSON — confirm the intended format with callers.
    with open(json_file, 'wb') as f:
        pickle.dump(books, f)

    print(len(books))
Ejemplo n.º 5
0
def get_joyce_texts():
    """Return {etext_id: stripped full text} for every James Joyce etext.

    The original also collected titles into a list that was never used
    (costing one extra metadata query per book); that dead code is gone.
    """
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_texts = {}
    for key in joyce_keys:
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return joyce_texts
Ejemplo n.º 6
0
    def get_text(self, title, author):
        """Fetch the full text of *title* from Project Gutenberg.

        *author* is currently unused; it is kept so the signature stays
        compatible with existing callers.

        NOTE: gutenberg.org goes down a lot, so fetching a full text may
        fail; the original workaround was to pre-download some books of
        mixed languages.

        Raises:
            LookupError: if no etext matches *title*.
        """
        # get_etexts returns a frozenset, which does not support
        # indexing — the original `[0]` always raised TypeError.
        # Take an arbitrary matching etext number instead.
        etext_ids = get_etexts('title', title)
        if not etext_ids:
            raise LookupError(f'No Gutenberg etext found for title {title!r}')
        guten_number = next(iter(etext_ids))
        text = strip_headers(load_etext(guten_number)).strip()
        return text
Ejemplo n.º 7
0
 def _run_get_etexts_for_feature(self, feature):
     """Assert get_etexts() retrieves each sample etext by *feature*.

     For every sample test case, queries get_etexts() with each of the
     case's values for *feature* (e.g. each of its titles or authors)
     and asserts that the case's etext number appears in the result.
     """
     for testcase in self.sample_data():
         for feature_value in getattr(testcase, feature):
             actual = get_etexts(feature, feature_value)
             # u(...) presumably wraps the message for py2/py3 unicode
             # compatibility — TODO confirm against the test helpers.
             self.assertTrue(
                 testcase.etextno in actual,
                 u("didn't retrieve {etextno} when querying for books that "
                   'have {feature}="{feature_value}" (got {actual}).').
                 format(etextno=testcase.etextno,
                        feature=feature,
                        feature_value=feature_value,
                        actual=actual))
Ejemplo n.º 8
0
 def _run_get_etexts_for_feature(self, feature):
     """Assert get_etexts() finds each sample etext by *feature* value."""
     # Every value the test case declares for this feature must map back
     # to the case's etext number via get_etexts().
     for testcase in self.sample_data():
         for feature_value in getattr(testcase, feature):
             actual = get_etexts(feature, feature_value)
             self.assertTrue(
                 testcase.etextno in actual,
                 u("didn't retrieve {etextno} when querying for books that "
                   'have {feature}="{feature_value}" (got {actual}).')
                 .format(
                     etextno=testcase.etextno,
                     feature=feature,
                     feature_value=feature_value,
                     actual=actual))
Ejemplo n.º 9
0
def search(query: str, include: Optional[str] = None) -> List[dict]:
    """Search Gutenberg etexts matching *query* and return result dicts.

    *query* is parsed (by parse_search) into a conjunction of
    (field, value) terms; the result is the intersection of the etext-id
    sets for every term.  *include*, when given, names extra metadata
    fields (parsed by parse_include) to attach to each result dict
    alongside 'text_id'.
    """
    fields = parse_include(include) if include else []
    conjunction = parse_search(query)

    # NOTE(review): as in the original, an empty conjunction makes
    # next(parts) raise StopIteration — confirm parse_search never
    # returns an empty list.
    parts = iter(get_etexts(field, value) for field, value in conjunction)
    results = set(next(parts))
    # Plain loop instead of a list comprehension used for side effects.
    for part in parts:
        results.intersection_update(part)

    return [
        dict([('text_id', text_id)]
             + [(field, get_metadata(field, text_id)) for field in fields])
        for text_id in results
    ]
Ejemplo n.º 10
0
 def _run_get_etexts_for_feature(self, feature):
     """Check get_etexts() results for *feature* against the sample data.

     Phantom test cases (records without a real etext) must NOT appear
     in the query results; all other cases must.
     """
     for testcase in self.sample_data():
         for feature_value in getattr(testcase, feature):
             actual = get_etexts(feature, feature_value)
             if testcase.is_phantom:
                 self.assertNotIn(testcase.etextno, actual)
             else:
                 self.assertIn(
                     testcase.etextno, actual,
                     "didn't retrieve {etextno} when querying for books "
                     'that have {feature}="{feature_value}" (got {actual}).'
                     .format(etextno=testcase.etextno,
                             feature=feature,
                             feature_value=feature_value,
                             actual=actual))
def get_books_by_lang():
    """Return the ids of all French-language etexts, or [] if the
    Gutenberg metadata cache has not been built yet.

    NOTE(review): relies on a module-level `args` namespace (for the
    `random` flag) — confirm it is populated before this is called.
    """
    try:
        bookids = list(get_etexts('language', 'fr'))
        if args.random:
            shuffle(bookids)
        return bookids
    except InvalidCacheException:
        # The metadata cache is missing; tell the user how to populate
        # it instead of crashing.
        print("""
    You need to create a Gutenberg cache first:
    Run those in your venv:

python -c 'from gutenberg.acquire import get_metadata_cache; get_metadata_cache().populate();'

    It might take a few hours.
                """)
        return list()
Ejemplo n.º 12
0
 def _run_get_etexts_for_feature(self, feature):
     """Check get_etexts() results for *feature* against the sample data.

     For every value the test case declares for this feature: phantom
     cases must be absent from the results, real cases must be present.
     """
     for testcase in self.sample_data():
         for feature_value in getattr(testcase, feature):
             actual = get_etexts(feature, feature_value)
             if testcase.is_phantom:
                 self.assertNotIn(testcase.etextno, actual)
             else:
                 self.assertIn(
                     testcase.etextno,
                     actual,
                     "didn't retrieve {etextno} when querying for books "
                     'that have {feature}="{feature_value}" (got {actual}).'
                     .format(
                         etextno=testcase.etextno,
                         feature=feature,
                         feature_value=feature_value,
                         actual=actual))
Ejemplo n.º 13
0
def count_words(args: argparse.Namespace) -> None:
    """Count the words in all Gutenberg books for a given language."""
    verbose = not args.quiet

    # Fetch the ids of every book in the requested language.
    if verbose:
        print("Processing Project Gutenberg books...")
    book_ids = get_etexts("language", args.language)
    book_iter = tqdm.tqdm(list(book_ids)) if verbose else book_ids

    word_counts = collections.Counter()
    pending_texts = []   # loaded texts not yet folded into word_counts
    failed_etexts = []   # ids whose download raised a GutenbergError
    for index, book_id in enumerate(book_iter):
        try:
            pending_texts.append(load_etext_from_cache(book_id))
        except GutenbergError as e:
            failed_etexts.append(book_id)
            print("Failure: ", e)
            continue
        # For efficiency, only periodically turn the texts into word counts.
        if index % PROCESS_CHUNK_SIZE == 0:
            word_counts += _count_words_in_etexts(pending_texts)
            pending_texts = []
            # Trim the least common words each chunk: they are usually
            # gibberish and dropping them keeps memory pressure down.
            most_common = word_counts.most_common(MAX_WORD_COUNT_LENGTH)
            word_counts = collections.Counter(dict(most_common))
    # Fold in whatever remained after the last full chunk.
    word_counts += _count_words_in_etexts(pending_texts)
    del word_counts[""]

    # Write the word counts out and report.
    if verbose:
        print(
            f"Failed to download {len(failed_etexts)} books. (A few of these are "
            "normal, as some books have no text.)"
        )
        print(f'--- Failed: {", ".join(str(etext) for etext in failed_etexts)}')
        print("Writing word counts to disk...")
    _output_word_counts(word_counts, args.output)
    if verbose:
        print(f"Done! See word counts in {args.output}.")
Ejemplo n.º 14
0
def prime_text_cache(args: argparse.Namespace) -> None:
    """
    Primes the Project Gutenberg text cache so text retrieval is entirely local.

    This will download all Gutenberg book texts onto your local machine, which
    will take many hours and ~10-20GB.
    """
    if not args.quiet:
        print("Downloading Project Gutenberg book texts...")
    etexts = get_etexts("language", args.language)
    # Cycle through mirrors so as not to overload anyone's servers and get rate-limited
    etexts_with_mirrors = list(zip(etexts, itertools.cycle(MIRRORS)))
    etexts_iter = (
        tqdm.tqdm(etexts_with_mirrors) if not args.quiet else etexts_with_mirrors
    )

    success_count = 0
    total_count = 0
    # Pre-bind the loop variables so the broad `except Exception` handler
    # below cannot itself raise NameError when the failure happens before
    # the first iteration binds them.
    etext = mirror = None
    try:
        for etext, mirror in etexts_iter:
            total_count += 1
            try:
                load_etext(etext, mirror=mirror)
                success_count += 1
            except GutenbergError as e:
                # Expected for books with no text format; report and move on.
                if not args.quiet:
                    print(f"Failure (mirror: {mirror}) ", e)
                continue
    except KeyboardInterrupt:
        # Allow a clean manual stop; partial progress is reported below.
        pass
    except Exception:
        print("Error with mirror: ", mirror, etext)
        raise

    if not args.quiet:
        print(f"{success_count} / {total_count} books downloaded to cache")
        print("Done!")
"""
Created on Wed Aug 12 18:06:45 2015

@author: Tony
Description: Pull etext numbers from Project Gutenberg for an author

1) First pip install gutenberg 0.4.0 library for Python from the command line

"""
 
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


# get the catalogue numbers of all the texts
# by Wilhelm Grimm in Project Gutenberg
bookList=get_etexts('author', 'Grimm, Wilhelm Carl')
# gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591]

#Once We can associate a number with a title we can pull the text
for number in bookList:
    print(number,get_metadata('title',number))
 
print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n')
# Once we have the text number we can print the text
# example 11027 is the number for Grimm's Fairy Stories 
# can be tempermental truncating text at top (console limit?) may need to trick around  
etext = strip_headers(load_etext(11027)).strip()
print(etext)
Ejemplo n.º 16
0
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 20 13:05:59 2015

@author: weizhi
"""

# Demo of the gutenberg metadata API using Moby Dick (etext #2701):
# look up title/author by etext number, then reverse-look-up the number.
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

print(get_metadata('title', 2701))  # prints 'Moby Dick; Or, The Whale'
print(get_metadata('author', 2701))  # prints 'Melville, Hermann'

print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # prints (2701, ...)
print(get_etexts('author', 'Melville, Hermann'))  # prints (2701, ...)
Ejemplo n.º 17
0
        return URIRef(value)


class LanguageExtractor(_SimplePredicateRelationshipExtractor):
    """Extracts book languages.

    """
    @classmethod
    def feature_name(cls):
        # Name under which this feature is queried, e.g.
        # get_etexts('language', ...).
        return 'language'

    @classmethod
    def predicate(cls):
        # RDF property path locating the language value in the metadata graph.
        return DCTERMS.language / RDF.value

    @classmethod
    def contains(cls, value):
        # Wrap the query value as an RDF literal for graph matching.
        return Literal(value)


if __name__ == '__main__':
    from gutenberg.acquire.metadata import set_metadata_cache, SleepycatMetadataCache
    # NOTE(review): hard-coded, user-specific cache path — parameterise or
    # read from configuration before sharing this script.
    cache = SleepycatMetadataCache('/Users/deanjones/gutenberg_data')
    set_metadata_cache(cache)

    from gutenberg.query import get_etexts
    # texts = gutenberg.query.api.get_etexts('language', 'en')
    # print len(texts)

    # Was a Python 2 print statement (`print get_etexts(...)`), which is a
    # SyntaxError on Python 3; use the print() function instead.
    print(get_etexts('language', 'en'))
Ejemplo n.º 18
0
def search_by_title(title):
    """Return the ids of Gutenberg etexts whose title matches *title*."""
    matching_ids = q.get_etexts('title', title)
    return list(matching_ids)
Ejemplo n.º 19
0
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 20 13:05:59 2015

@author: weizhi
"""

# Demo of the gutenberg metadata API using Moby Dick (etext #2701):
# forward lookups (title/author by etext number) followed by reverse
# lookups (etext numbers by metadata value).
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

print(get_metadata('title', 2701))  # prints 'Moby Dick; Or, The Whale'
print(get_metadata('author', 2701)) # prints 'Melville, Hermann'

print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # prints (2701, ...)
print(get_etexts('author', 'Melville, Hermann'))        # prints (2701, ...)
Ejemplo n.º 20
0
print("Hello. Welcome to the Gutenberg Analyser.")
print("We begin by downloading the relevant texts for each author.")
print("\n\n\n")

print("Enter the number of authors whose works you want to download: ")
n = int(input())

for j in range(n):
    print(
        "Enter the name of the author. Please make sure that string that you enter matches the author name on Project Gutenberg exactly"
    )
    author = input()
    print("Name entered by you is: ", author)

    print("Loading books.....")
    originalList = (get_etexts('author', author))
    dictionaryOfNames = OrderedDict(
    )  #contains names of the books and language of the book
    listOfTexts = []  #contains book number

    for i in originalList:
        try:
            text = strip_headers(load_etext(i)).strip()
            title = set(get_metadata('title', i))
            lanugage = set(get_metadata('language', i))
            dictionaryOfNames[title.pop()] = lanugage.pop()
            listOfTexts.append(i)
        except:
            pass
            #print("error found in download number",i)