def search(title_list=None, authors_list=None):
    """
    Receives two lists of books to download. Searches for their
    availability on Gutenberg and returns list of gutenberg IDs.
    """
    # Avoid mutable default arguments; treat None as "no queries".
    titles = [] if title_list is None else title_list
    authors = [] if authors_list is None else authors_list

    found_ids = set()  # set de-duplicates IDs matched by both queries
    for feature, queries in (("title", titles), ("author", authors)):
        for query in queries:
            matches = list(get_etexts(feature, query))
            if matches:
                print(f"Found {query}")
                found_ids.update(matches)
    return list(found_ids)
def trial():
    """Smoke-test the gutenberg API against Moby Dick (etext #2701)."""
    whale_text = strip_headers(load_etext(2701)).strip()
    print(whale_text)  # 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    # Metadata lookups return frozensets, e.g. frozenset([u'Moby Dick; Or, The Whale'])
    for feature in ('title', 'author'):
        print(get_metadata(feature, 2701))
    # Reverse lookups: title/author -> frozenset of etext numbers
    print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # frozenset([2701, ...])
    print(get_etexts('author', 'Melville, Herman'))  # frozenset([2701, ...])
def load_gutenberg(self, language='en'):
    """Load all of self.author's Gutenberg texts in `language`.

    Populates self.books (title -> list of paragraphs) and self.tokens
    (title -> POS-tagged tokens per paragraph). Titles whose download
    has no text format are skipped; duplicate titles are loaded once.
    """
    texts = get_etexts('author', self.author)
    # Keep only texts in the requested language, mapped id -> title.
    texts = {
        t: list(get_metadata("title", t))[0]
        for t in texts
        if list(get_metadata("language", t))[0] == language
    }
    new_texts = dict()  # ids of texts actually loaded, id -> cleaned title
    dupes = list()      # titles seen so far, to skip duplicate editions
    for k, d in texts.items():
        # Titles can contain CRLF line breaks; flatten to single-line.
        d = d.replace("\r\n", " ")
        if d not in dupes:
            dupes.append(d)
            new_texts[k] = d
            try:
                # Split the stripped text into paragraphs on blank lines
                # (Gutenberg texts use CRLF line endings).
                self.books[d] = strip_headers(
                    load_etext(k)).strip().split("\r\n\r\n")
            except UnknownDownloadUriException:
                # No plain-text download exists: roll back this entry so
                # new_texts/dupes only reflect successfully loaded books.
                print(
                    f'Book "{d}" does not have a text format and was not loaded.'
                )
                del new_texts[k]
                dupes.remove(d)
                continue
            # POS-tag every paragraph of the freshly loaded book.
            self.tokens[d] = [
                nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                for b in range(len(self.books[d]))
            ]
        else:
            # Duplicate title (another edition of a book already loaded).
            pass
    texts = new_texts
    print(texts)
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    # One author name per line in the input file.
    with open(author_file) as f:
        authors = [line.strip() for line in f]

    books = []
    for author in authors:
        for etext_id in get_etexts('author', author):
            try:
                # Only keep English texts; language metadata is a frozenset.
                if list(get_metadata('language', etext_id))[0] == 'en':
                    title = list(get_metadata('title', etext_id))[0]
                    etext = strip_headers(load_etext(etext_id)).strip()
                    books.append(Book(etext_id, title, etext))
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass

    # NOTE: despite the parameter name, the output is a pickle, not JSON.
    with open(json_file, 'wb') as f:
        pickle.dump(books, f)
    print(len(books))
def get_joyce_texts():
    """Download every James Joyce text from Project Gutenberg.

    Returns a dict mapping etext number -> header-stripped text.
    """
    keys = get_etexts('author', 'Joyce, James')
    titles = []  # collected but not returned (kept for parity with original)
    texts = {}
    for key in keys:
        titles.append(get_metadata('title', key))
        texts[key] = strip_headers(load_etext(key)).strip()
    return texts
def get_text(self, title, author):
    """ This function will access the title and author of a book from the
    Gutenberg project and save the data as a csv file

    PROBLEM HERE -- gutenberg goes down a lot, so getting a full text did not work.
    To bypass that, I downloaded some books of mixed languages.
    """
    # BUG FIX: get_etexts() returns a frozenset, which does not support
    # indexing -- the original `[0]` always raised TypeError. Take an
    # arbitrary matching etext number instead.
    matches = get_etexts('title', title)
    if not matches:
        raise ValueError(f"No Gutenberg text found for title {title!r}")
    guten_number = next(iter(matches))
    text = strip_headers(load_etext(guten_number)).strip()
    return (text)
def _run_get_etexts_for_feature(self, feature):
    """Check that every sample etext is found when queried by `feature`."""
    for case in self.sample_data():
        for value in getattr(case, feature):
            result = get_etexts(feature, value)
            message = u("didn't retrieve {etextno} when querying for books that "
                        'have {feature}="{feature_value}" (got {actual}).').format(
                            etextno=case.etextno,
                            feature=feature,
                            feature_value=value,
                            actual=result)
            self.assertTrue(case.etextno in result, message)
def _run_get_etexts_for_feature(self, feature):
    """Check that querying by `feature` returns each sample etext number."""
    for sample in self.sample_data():
        for query_value in getattr(sample, feature):
            retrieved = get_etexts(feature, query_value)
            failure_msg = u(
                "didn't retrieve {etextno} when querying for books that "
                'have {feature}="{feature_value}" (got {actual}).'
            ).format(etextno=sample.etextno,
                     feature=feature,
                     feature_value=query_value,
                     actual=retrieved)
            self.assertTrue(sample.etextno in retrieved, failure_msg)
def search(query: str, include: Optional[str] = None) -> List[dict]:
    """Run a conjunctive metadata search against the Gutenberg catalog.

    `query` is parsed into (field, value) terms; the result set is the
    intersection of the etexts matching every term. `include` names extra
    metadata fields to attach to each result dict.
    """
    fields = parse_include(include) if include else []
    conjunction = parse_search(query)
    id_sets = (get_etexts(field, value) for field, value in conjunction)
    try:
        results = set(next(id_sets))
    except StopIteration:
        # FIX: an empty query used to leak a bare StopIteration; a query
        # with no terms simply has no results.
        return []
    # FIX: intersect with a plain loop -- the original built a throwaway
    # list comprehension purely for its side effects.
    for id_set in id_sets:
        results.intersection_update(id_set)
    return [
        dict([('text_id', text_id)] +
             [(field, get_metadata(field, text_id)) for field in fields])
        for text_id in results
    ]
def _run_get_etexts_for_feature(self, feature):
    """Verify `feature` queries: real etexts are found, phantoms are not."""
    for sample in self.sample_data():
        for query_value in getattr(sample, feature):
            retrieved = get_etexts(feature, query_value)
            if sample.is_phantom:
                # Phantom records must never appear in query results.
                self.assertNotIn(sample.etextno, retrieved)
                continue
            self.assertIn(
                sample.etextno, retrieved,
                "didn't retrieve {etextno} when querying for books "
                'that have {feature}="{feature_value}" (got {actual}).'
                .format(etextno=sample.etextno,
                        feature=feature,
                        feature_value=query_value,
                        actual=retrieved))
def get_books_by_lang():
    """Return the etext IDs of all French-language books.

    Shuffles the IDs when --random was passed. Returns an empty list
    (with instructions printed) when the metadata cache is missing.
    """
    try:
        book_ids = list(get_etexts('language', 'fr'))
        if args.random:
            shuffle(book_ids)
        return book_ids
    except InvalidCacheException:
        print("""
You need to create a Gutenberg cache first:
Run those in your venv:

python -c 'from gutenberg.acquire import get_metadata_cache; get_metadata_cache().populate();'

It might take a few hours.
""")
        return list()
def _run_get_etexts_for_feature(self, feature):
    """Assert query results include real sample etexts and exclude phantoms."""
    for case in self.sample_data():
        for feature_value in getattr(case, feature):
            actual = get_etexts(feature, feature_value)
            if case.is_phantom:
                self.assertNotIn(case.etextno, actual)
            else:
                msg = ("didn't retrieve {etextno} when querying for books "
                       'that have {feature}="{feature_value}" (got {actual}).'
                       .format(etextno=case.etextno,
                               feature=feature,
                               feature_value=feature_value,
                               actual=actual))
                self.assertIn(case.etextno, actual, msg)
def count_words(args: argparse.Namespace) -> None: """Count the words in all Gutenberg books for a given language.""" # Pull the list of book IDs if not args.quiet: print("Processing Project Gutenberg books...") etexts = get_etexts("language", args.language) etexts_iter = tqdm.tqdm(list(etexts)) if not args.quiet else etexts # Load each book and count the words word_counts = collections.Counter() etexts = [] failed_etexts = [] for i, etext in enumerate(etexts_iter): try: etexts.append(load_etext_from_cache(etext)) except GutenbergError as e: failed_etexts.append(etext) print("Failure: ", e) continue # For efficiency, only periodically turn the texts into word counts if i % PROCESS_CHUNK_SIZE == 0: word_counts += _count_words_in_etexts(etexts) etexts = [] # Also trim the least common words, since they're usually # gibberish and it's helpful to keep memory pressure down word_counts = collections.Counter( dict(word_counts.most_common(MAX_WORD_COUNT_LENGTH)) ) word_counts += _count_words_in_etexts(etexts) del word_counts[""] # Output the word counts to a file if not args.quiet: print( f"Failed to download {len(failed_etexts)} books. (A few of these are " "normal, as some books have no text.)" ) print(f'--- Failed: {", ".join(str(etext) for etext in failed_etexts)}') print("Writing word counts to disk...") _output_word_counts(word_counts, args.output) if not args.quiet: print(f"Done! See word counts in {args.output}.")
def prime_text_cache(args: argparse.Namespace) -> None:
    """
    Primes the Project Gutenberg text cache so text retrieval is entirely local.
    This will download all Gutenberg book texts onto your local machine, which
    will take many hours and ~10-20GB.
    """
    if not args.quiet:
        print("Downloading Project Gutenberg book texts...")
    etexts = get_etexts("language", args.language)
    # Cycle through mirrors so as not to overload anyone's servers and get rate-limited
    etexts_with_mirrors = list(zip(etexts, itertools.cycle(MIRRORS)))
    etexts_iter = (
        tqdm.tqdm(etexts_with_mirrors) if not args.quiet else etexts_with_mirrors
    )
    success_count = 0
    total_count = 0
    try:
        for etext, mirror in etexts_iter:
            total_count += 1
            try:
                # load_etext caches the downloaded text locally as a side effect.
                load_etext(etext, mirror=mirror)
                success_count += 1
            except GutenbergError as e:
                # Expected for books with no text format; report and move on.
                if not args.quiet:
                    print(f"Failure (mirror: {mirror}) ", e)
                continue
    except KeyboardInterrupt:
        # Allow Ctrl-C to stop early while still printing the summary below.
        pass
    except Exception:
        # Unexpected failure: identify the mirror/etext involved, then re-raise.
        print("Error with mirror: ", mirror, etext)
        raise
    if not args.quiet:
        print(f"{success_count} / {total_count} books downloaded to cache")
        print("Done!")
""" Created on Wed Aug 12 18:06:45 2015 @author: Tony Description: Pull etext numbers from Project Gutenberg for an author 1) First pip install gutenberg 0.4.0 library for Python from the command line """ from gutenberg.query import get_etexts from gutenberg.query import get_metadata from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers # get the catalogue numbers of all the texts # by Wilhelm Grimm in Project Gutenberg bookList=get_etexts('author', 'Grimm, Wilhelm Carl') # gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591] #Once We can associate a number with a title we can pull the text for number in bookList: print(number,get_metadata('title',number)) print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n') # Once we have the text number we can print the text # example 11027 is the number for Grimm's Fairy Stories # can be tempermental truncating text at top (console limit?) may need to trick around etext = strip_headers(load_etext(11027)).strip() print(etext)
# -*- coding: utf-8 -*- """ Created on Sun Sep 20 13:05:59 2015 @author: weizhi """ from gutenberg.query import get_etexts from gutenberg.query import get_metadata print(get_metadata('title', 2701)) # prints 'Moby Dick; Or, The Whale' print(get_metadata('author', 2701)) # prints 'Melville, Hermann' print(get_etexts('title', 'Moby Dick; Or, The Whale')) # prints (2701, ...) print(get_etexts('author', 'Melville, Hermann')) # prints (2701, ...)
        # NOTE(review): tail of a method whose `def` lies above this chunk;
        # reconstructed indentation is a guess -- confirm against the full file.
        return URIRef(value)


class LanguageExtractor(_SimplePredicateRelationshipExtractor):
    """Extracts book languages. """

    @classmethod
    def feature_name(cls):
        # Metadata feature name used by get_etexts/get_metadata queries.
        return 'language'

    @classmethod
    def predicate(cls):
        # RDF property path locating the language value in the catalog graph.
        return DCTERMS.language / RDF.value

    @classmethod
    def contains(cls, value):
        return Literal(value)


if __name__ == '__main__':
    # Point the gutenberg library at a locally populated Sleepycat cache.
    from gutenberg.acquire.metadata import set_metadata_cache, SleepycatMetadataCache
    cache = SleepycatMetadataCache('/Users/deanjones/gutenberg_data')
    set_metadata_cache(cache)
    from gutenberg.query import get_etexts
    # texts = gutenberg.query.api.get_etexts('language', 'en')
    # print len(texts)
    # NOTE(review): Python 2 print statement -- this file predates Python 3.
    print get_etexts('language', 'en')
def search_by_title(title):
    """Return the Gutenberg etext IDs whose title matches `title`, as a list."""
    return list(q.get_etexts('title', title))
print("Hello. Welcome to the Gutenberg Analyser.") print("We begin by downloading the relevant texts for each author.") print("\n\n\n") print("Enter the number of authors whose works you want to download: ") n = int(input()) for j in range(n): print( "Enter the name of the author. Please make sure that string that you enter matches the author name on Project Gutenberg exactly" ) author = input() print("Name entered by you is: ", author) print("Loading books.....") originalList = (get_etexts('author', author)) dictionaryOfNames = OrderedDict( ) #contains names of the books and language of the book listOfTexts = [] #contains book number for i in originalList: try: text = strip_headers(load_etext(i)).strip() title = set(get_metadata('title', i)) lanugage = set(get_metadata('language', i)) dictionaryOfNames[title.pop()] = lanugage.pop() listOfTexts.append(i) except: pass #print("error found in download number",i)