def load_gutenberg(self, language='en'):
    texts = get_etexts('author', self.author)
    texts = {
        t: list(get_metadata("title", t))[0]
        for t in texts
        if list(get_metadata("language", t))[0] == language
    }
    new_texts = dict()
    dupes = list()
    for k, d in texts.items():
        d = d.replace("\r\n", " ")
        if d not in dupes:
            dupes.append(d)
            new_texts[k] = d
            try:
                self.books[d] = strip_headers(
                    load_etext(k)).strip().split("\r\n\r\n")
            except UnknownDownloadUriException:
                print(
                    f'Book "{d}" does not have a text format and was not loaded.'
                )
                del new_texts[k]
                dupes.remove(d)
                continue
            self.tokens[d] = [
                nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                for b in range(len(self.books[d]))
            ]
        else:
            pass
    texts = new_texts
    print(texts)
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)
    authors = [i.strip() for i in authors]
    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass
    with open(json_file, 'wb') as f:
        pickle.dump(books, f)
    print(len(books))
def gatherMetaData(bookID, text):
    # Metadata types
    '''
    taken from - https://github.com/hugovk/gutenberg-metadata
    id
    author
    formaturi - we want to filter or ignore this one, since we are more
                interested in the supporting information.
    language
    rights
    subject
    title
    fulltext - we will add this on
    '''
    subjectItems = []
    title, = get_metadata("title", bookID)
    author, = get_metadata("author", bookID)
    language, = get_metadata("language", bookID)
    rights, = get_metadata("rights", bookID)
    subject = get_metadata("subject", bookID)
    for item in subject:
        subjectItems.append(item)

    # The dict needs to be turned into a byte array.
    # We can do this using the json library.
    # refs: https://stackoverflow.com/questions/19232011/convert-dictionary-to-bytes-and-back-again-python
    SubjectDict = dict(subject=subjectItems)
    SubjectToBytes = json.dumps(SubjectDict)

    # The last step is to turn the JSON string into a bytes array.
    # ref: https://www.w3resource.com/python/python-bytes.php
    binary = bytes(SubjectToBytes, "utf8")

    storeInformationIntoDatabase(bookID, text, title, author, language,
                                 rights, binary)
def test_read_deleted_cache(self):
    self.cache.populate()
    set_metadata_cache(self.cache)
    self.cache.delete()
    try:
        get_metadata('title', 50405)
    except InvalidCacheException:
        pass
def test_read_unpopulated_cache(self):
    set_metadata_cache(self.cache)
    try:
        get_metadata('title', 50405)
    except InvalidCacheException:
        pass
    except:
        raise
def test_refresh(self):
    self.cache.populate()
    set_metadata_cache(self.cache)
    title = get_metadata('title', 30929)
    self.assertIn('Het loterijbriefje', title)
    self.cache.refresh()
    title = get_metadata('title', 30929)
    self.assertIn('Het loterijbriefje', title)
def trial():
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    print(get_metadata(
        'title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata(
        'author', 2701))  # prints frozenset([u'Melville, Hermann'])
    print(get_etexts(
        'title', 'Moby Dick; Or, The Whale'))  # prints frozenset([2701, ...])
    print(get_etexts(
        'author', 'Melville, Herman'))  # prints frozenset([2701, ...])
def metadata(book_id):
    """
    Args:
        book_id: book id (integer)

    Returns:
        metadata of that book: a dictionary with keys
        ["title", "authors", "language", "bookshelves"],
        where bookshelves is an empty set
    """
    title = {str(x) for x in gq.get_metadata('title', book_id)}
    authors = {str(x) for x in gq.get_metadata('author', book_id)}
    language = {str(x) for x in gq.get_metadata('language', book_id)}
    bookshelves = set()
    return book_metadata(book_id, title, authors, language, bookshelves)
def test_repopulate(self):
    self.cache.populate()
    set_metadata_cache(self.cache)
    self.cache.delete()
    self.cache.populate()
    title = get_metadata('title', 30929)
    self.assertIn(u('Het loterijbriefje'), title)
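# Note: the cache tests above rely on a populated metadata cache; as the
# unpopulated-cache test shows, get_metadata() raises InvalidCacheException
# until one exists. A minimal one-off setup sketch, assuming the documented
# gutenberg cache API (get_metadata_cache / populate), looks like this:
from gutenberg.acquire import get_metadata_cache
from gutenberg.query import get_metadata

cache = get_metadata_cache()
cache.populate()  # one-off download of the Project Gutenberg metadata catalog; can take hours

print(get_metadata('title', 2701))  # now returns the title set for ebook 2701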
def get_all_metadata(last_ebook_id):
    metadata = AutoVivification()
    i = 1  # First ebook starts at 1
    while i <= last_ebook_id:
        if i % 100 == 0:
            sys.stdout.write(str(i) + "\r")
        for feature_name in [
                "author",
                "formaturi",
                "language",
                "rights",
                "subject",
                "title",
        ]:
            data = get_metadata(feature_name, i)
            metadata[i][feature_name] = data
        i += 1
    sys.stdout.write("\r\n")
    # from pprint import pprint
    # pprint(metadata)
    with open("gutenberg-metadata.json", "w") as fp:
        json.dump(
            metadata,
            fp,
            cls=SetEncoder,
            indent=0,
            separators=(",", ":"),
            sort_keys=True,
        )
def acquire_corpora():
    while True:
        book = randint(100, 10000)
        if get_metadata('title', book):
            # uri = get_uri(book, 'images')
            uri = 'http://www.gutenberg.org/ebooks/{}'.format(book)
            return [book, get_title(book), uri, get_author(book)]
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return joyce_texts
def get_metadata(idx):
    ret = {}
    keys = q.list_supported_metadatas()
    for key in keys:
        ret[key] = list(q.get_metadata(key, idx))
    content = get_content(idx)
    ret['length'] = len(content)
    print(ret)
    return ret
def get_all_titles():
    my_catalog = book_catalog()
    for i in range(1, 65000):
        title = ''.join(list(get_metadata('title', i)))
        if title:
            lang = list(get_metadata('language', i))[0]
            if lang == 'en':
                my_book = book(
                    i,
                    ''.join(list(get_metadata('author', i))).replace(
                        "\n", " ").replace("\r", " "),
                    ''.join(list(get_metadata('title', i))).replace(
                        "\n", " ").replace("\r", " "),
                    ''.join(list(get_metadata('subject', i))).replace(
                        "\n", " ").replace("\r", " "))
                my_catalog.add_book(my_book)
    return my_catalog
def _run_get_metadata_for_feature(self, feature):
    for testcase in self.sample_data():
        expected = getattr(testcase, feature)
        actual = get_metadata(feature, testcase.etextno)
        self.assertTrue(
            set(actual) == set(expected),
            u('non-matching {feature} for book {etextno}: '
              'expected={expected} actual={actual}').format(
                  feature=feature,
                  etextno=testcase.etextno,
                  actual=actual,
                  expected=expected))
def search(query: str, include: Optional[str] = None) -> List[dict]:
    fields = parse_include(include) if include else []
    conjunction = parse_search(query)
    parts = iter(get_etexts(field, value) for field, value in conjunction)
    results = set(next(parts))
    [results.intersection_update(part) for part in parts]  # type: ignore
    return [
        dict([('text_id', text_id)] +
             [(field, get_metadata(field, text_id)) for field in fields])
        for text_id in results
    ]
def main():
    """ Main function of the test module """
    # setting up the API keys from local keys.py file
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']
    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)
    current_path = os.getcwd()
    file = open(os.path.join(current_path, "output", "log.json"), "w")
    gutenberg_titles = []

    # Getting the titles of the first few books on Project Gutenberg (extremely fast)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name
            filename = ''.join(
                e for e in title[0] if e.isalnum() or e == ' ') + ".txt"
            gutenberg_titles.append(filename[:-4])
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)

    titles = dict()
    # Searching for the books on Goodreads, reading their metadata
    for book_title in gutenberg_titles:
        try:
            lst = gc.search_books(book_title, search_field='title')
            if not lst:
                continue
            else:
                book = lst[0]
                titles[book.title] = (
                    book_title + ".txt",
                    str(book.popular_shelves),
                    str(book.similar_books),
                    str(book.authors),
                    dict(dict(book.work)['original_publication_year'])['#text'])
        except (request.GoodreadsRequestException, KeyError, TypeError):
            continue

    json.dump(titles, file, indent=4)
    file.close()
def get_title_gutenberg(gutenberg_id):
    """
    Gets title for novel with this gutenberg id

    >>> from gender_novels import corpus_gen
    >>> get_title_gutenberg(33)
    'The Scarlet Letter'

    """
    title = list(get_metadata('title', gutenberg_id))[0]
    for sep in TRUNCATORS:
        title = title.split(sep, 1)[0]
    return title
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')
    print(get_metadata('title', gutenberg_id))
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []
    for prev, current, next in trigrams:
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(next)

    def generate_using_trigrams():
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
def get_subject_gutenberg(gutenberg_id):
    """
    Tries to get subjects

    >>> from gender_novels import corpus_gen
    >>> get_subject_gutenberg(5200)
    ['Metamorphosis -- Fiction', 'PT', 'Psychological fiction']

    :param: author: str
    :param: title: str
    :param: id: int
    :return: list
    """
    # TODO: run doctest on computer with populated cache
    return sorted(list(get_metadata('subject', gutenberg_id)))
def get_author_gutenberg(gutenberg_id):
    """
    Gets author or authors for novel with this gutenberg id

    >>> from gender_novels import corpus_gen
    >>> get_author_gutenberg(33)
    ['Hawthorne, Nathaniel']
    >>> get_author_gutenberg(3178)
    ['Twain, Mark', 'Warner, Charles Dudley']

    :param gutenberg_id: int
    :return: list
    """
    # TODO: should we format author names like this?
    return list(get_metadata('author', gutenberg_id))
def language_invalidates_entry(gutenberg_id):
    """
    Returns False if book with gutenberg id is in English, True otherwise

    >>> from gender_novels.corpus_gen import language_invalidates_entry
    >>> language_invalidates_entry(46)  # A Christmas Carol
    False
    >>> language_invalidates_entry(27217)  # a Chinese-language text
    True

    :param gutenberg_id: int
    :return: boolean
    """
    language = list(get_metadata('language', gutenberg_id))[0]
    if language != 'en':
        return True
    else:
        return False
def rights_invalidate_entry(gutenberg_id):
    """
    Returns False if book with gutenberg id is in public domain in US, True otherwise

    >>> from gender_novels.corpus_gen import rights_invalidate_entry
    >>> rights_invalidate_entry(5200)  # Metamorphosis by Franz Kafka
    True
    >>> rights_invalidate_entry(8066)  # The Bible, King James version, Book 66: Revelation
    False

    :param gutenberg_id: int
    :return: boolean
    """
    rights = get_metadata('rights', gutenberg_id)
    if 'Public domain in the USA.' in rights:
        return False
    else:
        return True
def process(question, candidates=None, top_n=3, n_docs=3):
    torch.cuda.empty_cache()
    title = ''
    author = ''
    predictions = DrQA.process(
        question, candidates, top_n, n_docs, return_context=True
    )
    table = prettytable.PrettyTable(
        ['Rank', 'Answer', 'Doc-ID', 'Doc-Title', 'Doc-Author', 'Doc-Link',
         'Answer Score', 'Doc Score']
    )
    for i, p in enumerate(predictions, 1):
        if not list(get_metadata('title', p['doc_id'])):
            title = 'Not Available'
        else:
            title = list(get_metadata('title', p['doc_id']))[0]
        if not list(get_metadata('author', p['doc_id'])):
            author = 'Not Available'
        else:
            author = list(get_metadata('author', p['doc_id']))[0]
        if not list(get_metadata('formaturi', p['doc_id'])):
            url = 'Not Available'
        else:
            url = list(get_metadata('formaturi', p['doc_id']))[0]
        table.add_row([i, p['span'], p['doc_id'], title, author, url,
                       '%.5g' % p['span_score'], '%.5g' % p['doc_score']])
    print('Top Predictions:')
    print(table)
    strtable = table.get_string()
    '''
    print('\nContexts:')
    for p in predictions:
        text = p['context']['text']
        start = p['context']['start']
        end = p['context']['end']
        output = (text[:start] +
                  colored(text[start: end], 'green', attrs=['bold']) +
                  text[end:])
        print('[ Doc = %s ]' % p['doc_id'])
        print(output + '\n')
    print(type(strtable))
    print(type(p['doc_id']))
    print(type(output))
    # code.interact(banner=banner, local=locals())
    retstring = strtable + '\n' + '[ Doc = ' + str(p['doc_id']) + ']' + '\n' + output + '\n'
    '''
    return strtable
def random_gutenberg_document(language_filter='en') -> str:
    """Downloads a random document (book, etc.) from Project Gutenberg and returns it as a string.

    Keyword arguments:
        language_filter (str) -- restrict the random document to a particular language (default: English)
    """
    doc_language = None
    document = ''
    while (not doc_language or language_filter
           ) and doc_language != language_filter and len(document) == 0:
        # Keep grabbing random documents until one meets the language filter,
        # if specified, and verify it really has text
        document_id = random.randint(
            1, 60134)  # Pick a book at random (max id is currently 60134)
        lang_metadata = get_metadata('language', document_id)
        doc_language = next(
            iter(lang_metadata)) if len(lang_metadata) else False
        document = super_cleaner(strip_headers(
            load_etext(document_id).strip()), mark_deletions=False)
    return document
def getNBooks(nBooks, lang, loc):
    from gutenberg.acquire import load_etext
    from gutenberg.query import get_metadata
    from gutenberg.cleanup import strip_headers
    i = 0
    while i < nBooks:
        # rr is assumed to be a random-integer helper (e.g. random.randrange)
        # imported elsewhere in the module.
        n = rr(0, 10000)
        try:
            l = get_metadata("language", n)
            # if ('en' not in l):
            #     print(l)
            if lang in l:
                t = strip_headers(load_etext(n)).strip()
                f = open(loc + str(n) + '.txt', 'w')
                f.write(t)
                f.flush()
                f.close()
                print(i + 1, n)
                i += 1
        except:
            pass
def main():
    """ Main function of the test module """
    # setting up the API keys from local keys.py file
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']
    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)
    current_path = os.getcwd()
    file = open(os.path.join(current_path, "output", "log.txt"), "a")

    # Getting the titles of the first few books on Project Gutenberg (extremely fast)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name
            filename = ''.join(e for e in title[0] if e.isalnum()) + ".txt"
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)
            file.write(f"{title[0]} plaintext saved to '{title[0]}.txt'\n")

    # Getting the titles and publication years of the first books on Goodreads
    # (slow, because Goodreads allows one request per second)
    for i in range(1, 20):
        try:
            book = gc.book(i)
            file.write(
                f"{book.title} - published in {dict(dict(book.work)['original_publication_year'])['#text']}\n"
            )
        except (request.GoodreadsRequestException, KeyError):
            continue
def isvalid(id_num):
    """
    Check if a gutenberg book is an english textbook.

    Args:
        id_num: id of gutenberg book.

    Returns:
        a boolean which shows if id_num is id of an english text book
    """
    try:
        language = metadata(id_num)['language']
        form = get_metadata('formaturi', id_num)
        if 'en' not in language:
            return False
        form = ' '.join(form)
        if re.search(r'\d+\.txt', form):
            return True
        return False
    except:
        return False
def download():
    """
    Downloads the documents.
    :return:
    """
    for i in range(1, 3000):
        try:
            query = strip_headers(load_etext(i)).strip()
        except Exception:
            continue
        if 'DOCTYPE HTML PUBLIC' not in query[0:100]:
            a = ''.join(get_metadata('title', i))
            with open('files/' + a + '.txt', 'a') as f:
                try:
                    f.write(query)
                    print('Downloaded ' + a + '.\n')
                except UnicodeEncodeError:
                    f.close()
                    os.remove('files/' + a + '.txt')
                    continue
        else:
            print("Not Downloaded")
# get a random sample of labels.
from gutenberg.query import get_metadata
import random

random.seed(a=19031003)
random_docs = random.sample(range(57700), 100)

labels = set()
for doc in random_docs:
    subjects = get_metadata('subject', doc)
    for s in subjects:
        labels.add(s)

with open('genres.txt', 'w') as file:
    file.write(str(labels))
if __name__ == '__main__':
    mypath = "/home/ssamot/projects/github/gutenberg/processed/results/"
    onlyfiles = [(join(mypath, f), f[:-4], f) for f in listdir(mypath)
                 if isfile(join(mypath, f)) and f.endswith(".txt")]

    means = np.loadtxt("./data/ytotals.csv")
    means = list(means)
    totals = []
    print(means)
    starting_point = len(means)
    print(starting_point)
    for i, file in enumerate(onlyfiles):
        if i > starting_point:
            print(i, starting_point)
            fictid = int(file[-1].split("_")[0])
            title = list(get_metadata('title', fictid))
            author = list(get_metadata('author', fictid))
            print(fictid, title, author)
            if author == []:
                author.append("")
            # print(get_metadata("author", fictid))
            try:
                mean, total = get_mean(title[0] + " " + author[0])
                # time.sleep(2.0)
                means.append(mean)
                totals.append(total)
            except KeyError:
                means.append(-1)
            # print(means)
            # print(np.array(means))
""" Created on Wed Aug 12 18:06:45 2015 @author: Tony Description: Pull etext numbers from Project Gutenberg for an author 1) First pip install gutenberg 0.4.0 library for Python from the command line """ from gutenberg.query import get_etexts from gutenberg.query import get_metadata from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers # get the catalogue numbers of all the texts # by Wilhelm Grimm in Project Gutenberg bookList=get_etexts('author', 'Grimm, Wilhelm Carl') # gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591] #Once We can associate a number with a title we can pull the text for number in bookList: print(number,get_metadata('title',number)) print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n') # Once we have the text number we can print the text # example 11027 is the number for Grimm's Fairy Stories # can be tempermental truncating text at top (console limit?) may need to trick around etext = strip_headers(load_etext(11027)).strip() print(etext)
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 20 13:05:59 2015

@author: weizhi
"""
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

print(get_metadata('title', 2701))   # prints 'Moby Dick; Or, The Whale'
print(get_metadata('author', 2701))  # prints 'Melville, Hermann'

print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # prints (2701, ...)
print(get_etexts('author', 'Melville, Hermann'))        # prints (2701, ...)