def test_load_etext(self):
    loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
               lambda etextno: load_etext(etextno, refresh_cache=False))
    testcases = (
        SampleMetaData.for_etextno(2701),   # newstyle identifier
        SampleMetaData.for_etextno(5),      # oldstyle identifier
        SampleMetaData.for_etextno(14287),  # unicode text
        SampleMetaData.for_etextno(23962),  # UTF-8 text
    )
    for testcase, loader in itertools.product(testcases, loaders):
        text = loader(testcase.etextno)
        self.assertTrue(isinstance(text, str))
def test_load_etext(self):
    loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
               lambda etextno: load_etext(etextno, refresh_cache=False))
    testcases = (
        SampleMetaData.for_etextno(2701),   # newstyle identifier
        SampleMetaData.for_etextno(5),      # oldstyle identifier
        SampleMetaData.for_etextno(14287),  # unicode text
        SampleMetaData.for_etextno(23962),  # UTF-8 text
    )
    for testcase, loader in itertools.product(testcases, loaders):
        text = loader(testcase.etextno)
        self.assertIsInstance(text, unicode)
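The two test variants above differ only in the expected string type (Python 3 str versus Python 2 unicode); both exercise the same two calls that recur throughout this collection. For reference, a minimal usage sketch, assuming the c-w/gutenberg package is installed; book number 2701 (Moby Dick) is the example used by several snippets below.

# Minimal sketch: fetch one e-text and strip the Project Gutenberg boilerplate.
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()  # 2701 = Moby Dick
print(text[:80])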
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    del data_dir
    del tmp_dir
    del dataset_split
    # pylint: disable=g-import-not-at-top
    from gutenberg import acquire
    from gutenberg import cleanup
    # pylint: enable=g-import-not-at-top
    books = [
        # bookid, skip N lines
        (19221, 223),
        (15553, 522),
    ]
    for (book_id, toskip) in books:
        text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
        lines = text.split("\n")[toskip:]
        prev_line = None
        ex_count = 0
        for line in lines:
            # Any line that is all upper case is a title or author name
            if not line or line.upper() == line:
                prev_line = None
                continue
            line = re.sub("[^a-z]+", " ", line.strip().lower())
            if prev_line and line:
                yield {
                    "inputs": prev_line,
                    "targets": line,
                }
                ex_count += 1
            prev_line = line
def generate_paragraph():
    '''
    Generates a random paragraph from Project Gutenberg.

    :return: Text from Project Gutenberg with spaces and non-alphabetic
        characters removed and all characters lower case
    :rtype: str
    '''
    # Get the text from Project Gutenberg, in this case it's Moby Dick
    text = strip_headers(load_etext(2701)).strip()
    # text = ("Jack and Jill ran up the hill to get a pail of water. "
    #         "Jack fell down and broke his crown and Jill came tumbling after.")

    sentences = []
    paragraph = ""
    for sentence in text.split("."):
        sentences.append(sentence)

    # Select 2 random sentences
    paragraph = random.choice(sentences) + random.choice(sentences)
    paragraph = re.sub(r'\s+', '', paragraph)
    regex = re.compile('[^a-zA-Z]')
    paragraph = regex.sub('', paragraph).lower()

    return paragraph
def get_featurelists(book):
    # Preparation for topic features: get 100 most common uni-, bi- and trigrams of the given book
    common_ngrams = get_common_ngrams(book)

    # Extract the features of the given book
    features_book = (book_id, extract_features(book, common_ngrams))

    # Create new file and write the features of the given book to it
    path_feat_book = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_book.txt"
    with open(path_feat_book, 'r+', encoding="utf-8") as output_book:
        output_book.write(str(features_book))

    # Create new file to write the features of the dataset books to
    path_feat_books = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_dataset.txt"
    output_dataset = open(path_feat_books, 'r+', encoding="utf-8")

    # Extract the features of the dataset books
    features_dataset = []
    for i in IDs:
        features_dataset.append((i, extract_features(
            strip_headers(load_etext(i)).strip(), common_ngrams)))
        # Write the features to the output file
        output_dataset.write("\n Book " + str(i) + ": ")
        output_dataset.write(str(features_dataset[len(features_dataset) - 1]))
    output_dataset.close()

    return features_book, features_dataset
def get_gutenberg_text(id):
    try:
        text = strip_headers(load_etext(id)).strip()
        return text
    except Exception as ex:
        print(ex)
        return ''
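The bare except Exception above also swallows failures unrelated to Gutenberg. A narrower variant is sketched below; it catches the download-URI error that other snippets in this collection handle explicitly. The gutenberg._domain_model.exceptions import path is an assumption about the library version and may need adjusting.

# Hedged sketch: return '' only when the ID has no downloadable text.
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
# Import path is an assumption; some versions may expose this exception elsewhere.
from gutenberg._domain_model.exceptions import UnknownDownloadUriException


def get_gutenberg_text_safe(book_id):
    """Return the cleaned text, or '' if no text download exists for book_id."""
    try:
        return strip_headers(load_etext(book_id)).strip()
    except UnknownDownloadUriException as ex:
        print(ex)
        return ''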
def load_gutenberg(self, language='en'):
    texts = get_etexts('author', self.author)
    texts = {
        t: list(get_metadata("title", t))[0]
        for t in texts if list(get_metadata("language", t))[0] == language
    }
    new_texts = dict()
    dupes = list()
    for k, d in texts.items():
        d = d.replace("\r\n", " ")
        if d not in dupes:
            dupes.append(d)
            new_texts[k] = d
            try:
                self.books[d] = strip_headers(
                    load_etext(k)).strip().split("\r\n\r\n")
            except UnknownDownloadUriException:
                print(
                    f'Book "{d}" does not have a text format and was not loaded.'
                )
                del new_texts[k]
                dupes.remove(d)
                continue
            self.tokens[d] = [
                nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                for b in range(len(self.books[d]))
            ]
        else:
            pass
    texts = new_texts
    print(texts)
def load_macbeth():
    """
    Sources Macbeth from Project Gutenberg, returns a cleaned dataframe of
    the play split by act, scene, speaker, and sentence.
    """
    raw_text = load_etext(1533)         # Collect the text
    raw_text = strip_headers(raw_text)  # Remove most metadata

    # Remove in-line stage directions
    raw_text = remove_in_line_stage_directions(raw_text)

    # Split the text into sentences
    sentences = separate_sentences(raw_text)

    # Remove introductory data, keeping only the text
    sentences = sentences[110:]

    # Create a dataframe from the sentences
    macbeth = create_play_data_frame(sentences)

    # Clean the dataframe
    macbeth = clean_macbeth(macbeth)

    # Add a token column
    macbeth["tokens"] = create_token_column(macbeth["sentence"])

    # Return the finished dataframe
    return macbeth
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    del data_dir
    del tmp_dir
    del dataset_split
    books = [
        # bookid, skip N lines
        (19221, 223),
        (15553, 522),
    ]
    for (book_id, toskip) in books:
        text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
        lines = text.split("\n")[toskip:]
        for line in lines:
            # Any line that is all upper case is a title or author name
            if not line or line.upper() == line:
                continue
            line = re.sub("[^a-z]+", " ", line.strip().lower())
            if line:
                l = len(line)
                if l > 100:
                    l = 100
                yield {
                    "inputs": line,
                    "label": l,
                }
def main():
    """
    The main method.
    """
    parser = argparse.ArgumentParser(
        description='Word suggestion based on Project Gutenberg books.')
    parser.add_argument('--book-id',
                        dest='book_ids',
                        nargs='+',
                        type=int,
                        required=True,
                        help='the book id of the Project Gutenberg')
    parser.add_argument('--query',
                        nargs='+',
                        type=str,
                        required=True,
                        help='suggest next word for list of string',
                        action=required_length(1, 5))

    try:
        args = parser.parse_args()
        text_indexer = TextIndexer(len(args.query))
        for book_id in list(dict.fromkeys(args.book_ids)):
            text = strip_headers(load_etext(book_id)).strip()
            text_indexer.add_text(book_id, text)
        print(text_indexer.suggest(*args.query))
    except Exception as exc:  # pylint: disable=W0703
        print(exc)
def getSomeBooks(howManyBooks, startingAt=1):
    i = howManyBooks
    ii = startingAt
    nothing = 0
    valError = 0
    otherError = 0
    allTheBooks = []
    while i > len(allTheBooks):  # 54096 ceiling
        try:
            theText = strip_headers(
                load_etext(ii)).strip()  # load the full text into theText
            theLength = len(theText)
            if len(theText) > 292:
                allTheBooks.append([ii, theText])
                print("one more book in the list, book number:", ii,
                      "book total is:", len(allTheBooks))
            else:
                nothing = nothing + 1
                print("nothing here at number:", ii)
        except ValueError:
            valError = valError + 1
            print("valueError at book number:", ii)
        except:
            otherError = otherError + 1
            print("otherError at book number:", ii)
        ii = ii + 1
    print('all done')
    print(len(allTheBooks))
    return allTheBooks
def post_corpora(url, auth_token):
    corpora = acquire_corpora()
    text = strip_headers(load_etext(corpora[0])).strip()
    print(corpora, text[:100])

    authentication_token = {'authentication-token': auth_token}
    # data to post
    files = {'file': io.StringIO(text)}
    data = {
        'label': '{} {}'.format(corpora[1], corpora[3]),
        'source': corpora[2]
    }
    # post
    ru = requests.post(url, headers=authentication_token, files=files, data=data)
    print(ru.url, ru.status_code)
    if ru.ok:
        print(ru.json())
    else:
        print(ru.status_code, ru.reason)
def download(cfg):
    print('Downloading Gutenberg data to: ' + cfg.directory)

    # Load language data for all books.
    path = os.path.join('code', 'utils', 'metadata.txt')
    with open(path, encoding='utf-8') as f:
        counter = 0
        for line in f:
            [index, lang, r, author, title] = line.split('\t')
            r = int(r)
            i = int(index)
            if counter < cfg.max_books and r == 1 and lang in cfg.languages:
                # Get the book.
                try:
                    text = strip_headers(load_etext(i)).strip().encode('utf-8')
                except UnknownDownloadUriException:
                    print('Could not download book: ' + str(i))
                    continue
                # Save the file to the correct directory.
                path = os.path.join(cfg.directory, lang)
                if not os.path.exists(path):
                    os.mkdir(path)
                with open(os.path.join(path, str(i) + '.txt'), 'wb') as f:
                    f.write(text)
                counter += 1
                if not counter % 1000:
                    print('Downloaded ' + str(counter) + ' books')
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)
    authors = [i.strip() for i in authors]
    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass
    with open(json_file, 'wb') as f:
        pickle.dump(books, f)
    print(len(books))
def poetry_cleaner(poetry_books=BOOKS):
    with open(INPUT_DATA_WRITE_PATH + OUT_PATH, 'w') as ofp:
        lineno = 0
        for (id_nr, toskip, title) in poetry_books:
            startline = lineno
            text = strip_headers(load_etext(id_nr)).strip()
            lines = text.split('\n')[toskip:]
            for line in lines:
                if (0 < len(line) < 50 and line.upper() != line
                        and not re.match('.*[0-9]+.*', line)):
                    cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower())
                    if lineno < 100:
                        ofp.write(cleaned)
                        ofp.write('\n')
                        lineno = lineno + 1
                    else:
                        ofp.write('\n')
            print('Wrote lines {} to {} from {}'.format(startline, lineno, title))
def sample_paragraphs(book_id, n_parag, min_length):
    """Get book as text file and randomly sample a fixed number of paragraphs."""
    # Get book as string and remove metadata
    book = load_etext(book_id)
    book = strip_headers(book).strip()
    # Remove the character we'll choose as separator
    book = book.replace("|", " ")
    # Split paragraphs
    parag = book.split("\n\n")
    # Remove single line breaks
    parag = [x.replace("\n", " ") for x in parag]
    # Remove paragraphs below a certain length
    parag = [p for p in parag if len(p) > min_length]
    # Exclude first/last 10 parag from sampling as they may contain remaining metadata
    parag = parag[10:-10]
    # Sample paragraphs
    seed(42)
    sample_ind = randint(0, len(parag), n_parag)
    if n_parag is not None:
        if n_parag > len(parag):
            raise ValueError(
                "The number of paragraphs to sample is higher than the "
                "total number of paragraphs."
            )
        else:
            parag_sampled = [parag[i] for i in sample_ind]
    else:
        # If n_parag is None, all paragraphs are sampled
        parag_sampled = parag
    return parag_sampled
def regular_view(request, book_num):
    name = Book.get_book_name(book_num)
    bookText = strip_headers(load_etext(book_num)).strip()
    filteredText = removeStopWords(bookText)
    args = {'content': [bookText], 'content2': [filteredText], 'name': name}
    return render(request, "pages/regularText.html", args)
def get_raw_book():
    while True:
        try:
            # 46000 is approximately the size of the Gutenberg catalogue
            text = load_etext(random.randrange(46000))
        except ValueError:
            # in case of no download method for that text id
            pass
        else:
            return strip_headers(text)
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}

    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()

    return joyce_texts
def search_display_options(my_catalog):
    search_result_catalog = book_catalog()
    search_type = input(
        'Please select a search type: Author, Subject, Title [Aa/Ss/Tt]: ')
    if search_type == 'A' or search_type == 'a':
        search_term = input('Please enter a search term for an Author: ')
    elif search_type == 'T' or search_type == 't':
        search_term = input('Please enter a search term for a Title: ')
    elif search_type == 'S' or search_type == 's':
        search_term = input('Please enter a search term for a Subject: ')
    else:
        print('Invalid search type...')
        return

    # set match flag to false
    match = False

    # fill up a set of all the titles that match the search
    for my_book in my_catalog.get_books():
        if (search_type == 'a' or search_type == 'A') and set(
                my_book.get_book_author().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True
        if (search_type == 't' or search_type == 'T') and set(
                my_book.get_book_title().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True
        if (search_type == 's' or search_type == 'S') and set(
                my_book.get_book_subject().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

    search_result_catalog.display_titles_by_author()

    if match:
        title_num = input('Please type a title number from the above list: ')
        print('Displaying Word Cloud in [Subject: ' +
              my_book.get_book_subject() + '] for [Title: ' +
              my_book.get_book_title() + '] by [Author:' +
              my_book.get_book_author() + ']')
        try:
            my_book = search_result_catalog.get_book(title_num)
            # call that gets book text from gutenberg
            return strip_headers(load_etext(int(title_num))).strip()
        except:
            print('Failed to find a textual download candidate for ' +
                  my_book.get_book_title())
            return None
    else:
        print('No matches found for [' + search_term + ']...')
        return None
def text_from_pg(id_number):
    # https://github.com/c-w/Gutenberg
    from gutenberg.acquire import load_etext
    # from gutenberg.cleanup import strip_headers
    # text = strip_headers(load_etext(id_number)).strip()
    text = load_etext(id_number).strip()
    return text
def tab():
    with open("BookRoulette.html", "w") as f:
        x = random.randint(1, 60059)
        book = strip_headers(load_etext(x)).strip()
        f.write(book)
    filename = 'file:///' + os.getcwd() + '/' + 'BookRoulette.html'
    webbrowser.open_new_tab(filename)
    return render_template('BookRoulette.html', book=book)
def acquire_and_process(name: str, txt_num: int):
    """
    Convenience function that minhashes a Project Gutenberg text given the
    text id number (can be found on gutenberg.org, for instance in the url).
    """
    txt = strip_headers(load_etext(txt_num))
    with open("texts/%s.txt" % name, "w") as f:
        f.write(txt)
    process_file("texts/%s.txt" % name)
def download():
    with open("GutProject.doc", "w") as f:
        x = random.randint(1, 60059)
        text = strip_headers(load_etext(x)).strip()
        f.write(text)
    return send_file('GutProject.doc',
                     mimetype='application/msword',
                     attachment_filename='GutProject.doc',
                     as_attachment=True)
def downloadBook():
    """If posting, takes in a book number from getty.html, installs the book
    into the database. Otherwise displays getty.html"""
    if request.method == "POST":
        bookNum = int(request.form.get("bookNum"))
        words = strip_headers(load_etext(bookNum)).strip()
        installText(words)
        return render_template("homepage.html")
    else:
        return render_template("getty.html")
def __init__(self, book_number=2701, first_page=20, last_page=20):
    self.text = strip_headers(load_etext(book_number))
    # print(list_supported_metadatas())  # prints (u'author', u'formaturi', u'language', ...)
    # print(get_metadata('title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
    # print(get_metadata('author', 2701))  # prints frozenset([u'Melville, Hermann'])
    # print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    self.pages = []
    self.first_page = first_page
    self.last_page = last_page
    self.print_book()
def gutToText(number, name):
    filename = name + "_raw.txt"
    if os.path.isfile(filename) == False:
        book = open(filename, "w")
        text = strip_headers(load_etext(number)).strip()
        words = text
        print "Loaded and writing %s" % (name)
        book.write(words.encode('utf-8'))
        print "Done writing %s" % (name)
        book.close()
def get_text(self, title, author):
    """
    This function will access the title and author of a book from the
    Gutenberg project and save the data as a csv file.

    PROBLEM HERE -- gutenberg goes down a lot, so getting a full text did
    not work. To bypass that, I downloaded some books of mixed languages.
    """
    guten_number = list(get_etexts('title', title))[0]
    text = strip_headers(load_etext(guten_number)).strip()
    return text
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get Project Gutenberg document ID from url string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search("(?:files|ebooks|epub)\/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()),
                         mark_deletions=False)
def generateBooks(lastBookID):
    firstBookID = 1
    # look through and grab each book
    while firstBookID <= lastBookID:
        # load and grab the eBook
        try:
            text = strip_headers(load_etext(firstBookID)).strip()
            gatherMetaData(firstBookID, text)
            firstBookID = firstBookID + 1
        except:
            print("error with book", firstBookID)
            firstBookID = firstBookID + 1
def trial():
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'

    print(get_metadata('title', 2701))   # prints frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author', 2701))  # prints frozenset([u'Melville, Hermann'])

    print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # prints frozenset([2701, ...])
    print(get_etexts('author', 'Melville, Herman'))         # prints frozenset([2701, ...])
def create_model():
    """Read in Project Gutenberg data, convert each into a markovify Text
    model object, then combine them into one model. Returns the model.
    """
    eap_1 = strip_headers(load_etext(2147)).strip()   # edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()   # edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()                      # charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  # collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()                     # gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()                     # gaboriau: the mystery of orcival
    calais = strip_headers(
        load_etext(16339)).strip()                    # griffiths: the passenger from calais
    leavenworth = strip_headers(
        load_etext(4047)).strip()                     # green: the leavenworth case
    agent = strip_headers(load_etext(974)).strip()    # conrad: the secret agent
    thirtynine = strip_headers(
        load_etext(558)).strip()                      # buchan: the thirty-nine steps

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)
    calais_model = markovify.Text(calais, state_size=3)
    leavenworth_model = markovify.Text(leavenworth, state_size=3)
    agent_model = markovify.Text(agent, state_size=3)
    thirtynine_model = markovify.Text(thirtynine, state_size=3)

    # NOTE: will need to play around with the weighting based on the text lengths
    # so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model, lerouge_model,
        orcival_model, calais_model, leavenworth_model, agent_model,
        thirtynine_model
    ])

    return all_model
def main():
    eap_1 = strip_headers(load_etext(2147)).strip()   # edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()   # edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()                      # charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  # collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()                     # gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()                     # gaboriau: the mystery of orcival

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)

    # NOTE: will need to play around with the weighting based on the text lengths
    # so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model, lerouge_model,
        orcival_model
    ])

    # to do: loop to create different chapters - probably make them short
    # (~ten sentences?) at first to make sure that they work properly
    print "\n\n\n Creating Chapters"
    # this will be a list of all the chapters, they should be complete at this
    # point (all replacement/etc done)
    chapters = create_chapters()
    """
def main():
    """ Main function of the test module """
    # setting up the API keys from local keys.py file
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']

    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)

    current_path = os.getcwd()
    file = open(os.path.join(current_path, "output", "log.json"), "w")
    gutenberg_titles = []

    # Getting the title of the first 3000 books on Project Gutenberg (EXTREMELY FAST)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name
            filename = ''.join(
                e for e in title[0] if e.isalnum() or e == ' ') + ".txt"
            gutenberg_titles.append(filename[:-4])
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)

    titles = dict()

    # Searching for the books on Goodreads, reading their metadata
    for book_title in gutenberg_titles:
        try:
            lst = gc.search_books(book_title, search_field='title')
            if not lst:
                continue
            else:
                book = lst[0]
                titles[book.title] = (
                    book_title + ".txt",
                    str(book.popular_shelves),
                    str(book.similar_books),
                    str(book.authors),
                    dict(dict(book.work)['original_publication_year'])['#text'])
        except (request.GoodreadsRequestException, KeyError, TypeError):
            continue

    json.dump(titles, file, indent=4)
    file.close()
def get_book_text(csvfile):
    'gets text for book using project gutenberg catalog'
    book_list = open_csv(csvfile)
    for i, value in enumerate(book_list):
        # print i, value
        a = int(book_list[i][2])  # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
def process_file(filename, outdir):
    outpath = outdir + '/%s.txt'
    with open(filename) as f:
        for line in f:
            spl = line.split('|')
            book = spl[0]
            uids = map(int, spl[3].strip(string.lowercase + '\n').split(','))
            try:
                with open(outpath % book, 'w') as out:
                    for uid in uids:
                        raw_text = load_etext(uid)
                        try:
                            text = strip_headers(
                                unidecode(raw_text.encode('latin-1').decode('utf-8')))
                        except UnicodeDecodeError:
                            text = strip_headers(raw_text)
                        out.write(text.encode('utf-8'))
            except ValueError as e:
                print '%s|%s' % (book, uid), e
                os.remove(outpath % book)
def the_text(self):
    try:
        self.novel = load_etext(self.novel_num)
    except:
        rejects.append(self.novel_num)
        return False
    if re.search('Character set encoding: ASCII', self.novel):
        self.novel = strip_headers(self.novel)
        self.novel = self.novel.replace('\n', ' ')
        self.novel = TextBlob(self.novel)
        self.novel_sentences = self.novel.sentences
        self.m = str(self.novel_num)
        with open('novel_' + self.m + 'list_1.csv', 'wb') as f:
            writer = csv.writer(f)
            for sentence in self.novel_sentences:
                writer.writerow([sentence])
    else:
        rejects_2.append(self.novel_num)
        return False
def check_text():
    with open('raw.json') as inputfile:
        data = json.load(inputfile)

    for record in tqdm(data):
        id = record['metadata']['id']
        title = clean(record['book']['title'])
        text = load_etext(id)
        if id in lookup_dates:
            release_date = lookup_dates[id]
        else:
            for line in text.split("\n"):
                if line.startswith('Release Date:'):
                    release_date = line.replace('Release Date:', '').split('[')[0]
                    break
            else:
                print id, title

        record['book']['author'] = record['author']
        author_name = record['book']['author']['name']
        vals.add(record['book']['author']['birth'])
        if record['book']['author']['birth'] == None:
            record['book']['author']['birth'] = 0
        if record['book']['author']['death'] == None:
            record['book']['author']['death'] = 0
        vals2.add(record['book']['author']['birth'])
        record['book']['author']['name'] = clean(author_name) if author_name else "Unknown"
        del record['author']

        month, day, year = extract_date(release_date)
        release_date = release_date.strip()
        record['book']['publication'] = {
            'full': release_date if month != 'Jan' else release_date.replace('Jan', 'January'),
            'year': year,
            'month name': month if month != 'Jan' else 'January',
            'month': month_lookup[month],
            'day': day
        }
        record['bibliography'] = record['book']
        del record['book']
        record['metrics'] = record['statistics']
        del record['statistics']

    with open('classics-2.json', 'w') as output:
        json.dump(data, output, indent=2)
def fetch_gutenberg(filename=None):
    from gutenberg.acquire import load_etext
    from gutenberg.cleanup import strip_headers
    from gutenbergsettings import popularTitles, saveInterval

    start = time.time()
    lastsave = start
    with redirect(filename):
        try:
            for title in popularTitles:
                text = strip_headers(load_etext(title)).strip()
                serialize([(title, text)], '../serialized/guten%s' % title)
                sinceLast = time.time() - lastsave
                print('%s since last save' % sinceLast)
                if sinceLast > saveInterval:
                    concatenate('guten')
                    lastsave = time.time()
        except KeyboardInterrupt:
            concatenate('guten')
            sys.exit(0)
def extract_subroutine(data, src_dir, century):
    session = model.get_session()
    backoff = 1
    counter = 0
    for metadata in data:
        contents = extract_book_contents(metadata)
        if contents is None:
            backoff *= 1.5
            continue

        title = metadata['title']
        author = metadata['author']
        e_id = metadata['id']

        if type(title) == list:
            title = dasherize(title)

        text_file_path = os.path.join(src_dir, dasherize(title.split(" ")))
        text = strip_headers(load_etext(e_id)).strip()
        f = open(text_file_path, "w")
        f.write(text)
        f.close()

        book = model.Book(
            title=title,
            author=author,
            e_id=e_id,
            century=century,
            text_file_path=text_file_path
        )
        session.add(book)
        session.commit()
        log.info("successfully added " + title)
        counter += 1
        time.sleep(backoff)

    log.info("---- finished run. added %d books ----" % counter)
def main():
    filename = "gutenbergscifi.csv"
    json_filename = "gendered_words.json"
    if os.path.isfile(filename):
        print "file exists"
    else:
        write_csv = extract_text_urls(filename)
        print "file created"
    book_list = open_csv(filename)
    print book_list
    for i, value in enumerate(book_list):
        # print i, value
        a = int(book_list[i][2])  # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
        # print text
        clean_text = remove_punc_html(text)
        ratio = gender_ratio(clean_text)
        print author, title, ratio
        uber_key = author
        sub_key = title
        sub_value = ratio
        uber_value = {author: {title: ratio}}
        json_source = read_write_json_object(json_filename="gendered_words.json",
                                             uber_key=uber_key,
                                             uber_value=uber_value,
                                             sub_key=sub_key,
                                             sub_value=sub_value,
                                             READ=False,
                                             WRITE=True)
def download_book(title, gutenberg_id, data_path, sleep=0):
    print('downloading {:}'.format(title))
    full_text = strip_headers(load_etext(gutenberg_id)).strip()
    summary = downloadSummary(title)

    if full_text is None:
        print('Full text is None. Skipping {:}'.format(title))
        return
    if summary is None:
        print('Summary is None. Skipping {:}'.format(title))
        return

    output_data = {'title': title, 'full_text': full_text, 'summary': summary}
    output_file = os.path.join(data_path, '{:}.json'.format(gutenberg_id))
    with open(output_file, 'w') as f:
        json.dump(output_data, f, ensure_ascii=False)

    time.sleep(sleep)
def getBook(bookDetails):
    global timeAtLastFetch

    cachedFilename = "cache/" + fileNameForBook(bookDetails) + ".txt"
    if os.path.isfile(cachedFilename):
        with open(cachedFilename) as bookfile:
            text = bookfile.read()
        return TextBlob(text)

    nowMS = milliseconds()
    timeSinceLastFetch = nowMS - timeAtLastFetch
    if timeSinceLastFetch < gutenbergWaitTimeMS:
        waitTime = gutenbergWaitTimeMS - timeSinceLastFetch
        print " waiting {}ms for Gutenberg...".format(waitTime)
        time.sleep(waitTime / 1000)

    bookId = bookDetails['id']
    print "Fetching from Gutenberg id {}".format(bookId)
    source = load_etext(bookId)

    print " cleaning...."
    source = removeUnicodeWords.sub("", source)
    source = removeUnicodeCharacters.sub("", source)
    source = removePossessive.sub("", source)
    source = removeWordsWithApostrophe.sub("", source)
    source = removeHyphens.sub(" ", source)
    source = removeChapterHeaders.sub("", source)
    source = removeRomanNumerals.sub("", source)
    source = removeEllipsis.sub("", source)
    text = strip_headers(source).strip()

    timeAtLastFetch = milliseconds()

    if not os.path.isdir("cache"):
        os.mkdir("cache")
    bookfile = open(cachedFilename, 'w')
    bookfile.write(text)
    bookfile.close()

    print " fetched and cached " + bookDetails['title']
    return TextBlob(text)
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')
    print get_metadata('title', gutenberg_id)
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, next in trigrams:
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(next)

    def generate_using_trigrams():
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)

    return tweets
# -*- coding:utf-8 -*-

# Libraries
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

texto = load_etext(2000)
texto = strip_headers(texto)

# Replace the fragment "qu" with "k" so that "(q)ue" and "(q)ui" are not counted as diphthongs.
# Replace diphthongs containing "y" by appending "-" so that words where the "y" is a consonant
# rather than a vowel are not matched.
texto = texto.replace("que", "ke")
texto = texto.replace("qui", "ki")
texto = texto.replace("gue", "ke")
texto = texto.replace("gui", "ki")
texto = texto.replace("ay", "ay-")
texto = texto.replace(u"áy", u"áy-")
texto = texto.replace("ey", "ey-")
texto = texto.replace(u"éy", u"éy-")
texto = texto.replace("oy", "oy-")
texto = texto.replace("uy", "uy-")
texto = texto.lower()

# Split the text into words.
# Diphthong: combination of an open vowel (/a e o/) with a closed one (/i u/), or vice versa;
# the closed vowel must not be stressed.
# A space has to mark that the "y" must sit at the end of the word.
palabras = texto.split()

dic_diptongos = {
def split_sentences(text):
    for sentence_separator in [u'. ', u'.\n', u'? ', u'! ', u'?\n', u'!\n',
                               u'; ', u';\n', u'- ', u'--', u'...',
                               u'\n', u'\n\n', u'\n\n\n']:
        text = text.replace(sentence_separator, u'|||')
    return text.split(u'|||')


# Report how many books the corpus contains.
print u'Total de libros en español:', len(codigos_libros.es)

# Now load the books and strip their headers.
dic_oraciones_es = {}
total_palabras_es = 0
for codigo_libro_es in codigos_libros.es:
    texto = load_etext(codigo_libro_es)
    texto = strip_headers(texto)
    # In each book, split the sentences, delimited by the ||| marker.
    oraciones_libro = split_sentences(texto)
    for oracion_libro in oraciones_libro:
        palabras = rufino.split_words(oracion_libro)
        numero_palabras_oracion = len(palabras)
        total_palabras_es += numero_palabras_oracion
        if numero_palabras_oracion not in dic_oraciones_es:
            dic_oraciones_es[numero_palabras_oracion] = 1
        else:
            dic_oraciones_es[numero_palabras_oracion] = dic_oraciones_es[numero_palabras_oracion] + 1

print u'Total de oraciones en español:', len(dic_oraciones_es)
import nltk
from nltk.text import Text
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

id = input("Input gutenberg id to load: ")
text = strip_headers(load_etext(id)).strip()
raw_input("Enter to print text preview...")
print(text[:1000])
text = text.split()
text = Text(text)


def ask():
    test = raw_input("Which analysis to perform ('list' to see list): ")
    if(test == "list"):
        print("concordance, dispersionplot, wordcount, lexicaldiversity, frequency, collocations")
        ask()
    if(test == "concordance"):
        conc = raw_input("word: ")
        text.concordance(conc)
        ask()
    if(test == "dispersionplot"):
        disp = []
        keepasking = True
        i = 0
        while(keepasking):
            input = raw_input("word " + str(i) + " (blank to stop): ")
            if(len(input) > 0):
                disp.append(input)
            else:
                keepasking = False
    17406, 17430, 17491, 20401, 21651, 23206, 23236, 24536, 24601, 24925,
    25317, 25640, 25687, 25777, 25988, 26284, 26655, 27736, 29497, 29506,
    29663, 29799, 29831, 30053, 30122, 30425, 30535, 30903, 30986, 31013,
    31464, 31541, 31613, 31637, 31707, 32235, 32315, 32364, 33690, 35882,
    36253, 36453, 36573, 36940, 37067, 37095, 37139, 37637, 38814, 39444,
    39613, 39990, 41746, 42727,)

guardar_cadena = 0
cadena = u''
interjecciones = {}
contador = 0

for texto in textos:  # Repeat the loop for each book
    texto = load_etext(texto)             # Load the text
    texto = strip_headers(texto).lower()  # Strip the headers
    texto = unicode(texto)
    for caracter in texto:  # walk the text character by character
        if caracter == u'¡':  # If an opening exclamation mark is found
            guardar_cadena = 1  # Set a variable to start saving the string
            cadena = cadena + unicode(caracter)
        if caracter == u'!':  # If a closing exclamation mark is found
            cadena = cadena + unicode(caracter)       # 1. Save that last character (optional)
            if cadena in interjecciones.keys():       # 2. Check whether the string is in the dictionary
                interjecciones[cadena] += 1           # 3. If it is, add one to its counter
            else:                                     # 4. If it is not,
                interjecciones[cadena] = 1            #    add it, starting from 1
            guardar_cadena = 0  # 5. Change the flag so nothing more is saved
            cadena = ''
        if guardar_cadena == 1:  # 6. Check whether the save-string flag is set to 1
""" Created on Wed Aug 12 18:06:45 2015 @author: Tony Description: Pull etext numbers from Project Gutenberg for an author 1) First pip install gutenberg 0.4.0 library for Python from the command line """ from gutenberg.query import get_etexts from gutenberg.query import get_metadata from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers # get the catalogue numbers of all the texts # by Wilhelm Grimm in Project Gutenberg bookList=get_etexts('author', 'Grimm, Wilhelm Carl') # gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591] #Once We can associate a number with a title we can pull the text for number in bookList: print(number,get_metadata('title',number)) print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n') # Once we have the text number we can print the text # example 11027 is the number for Grimm's Fairy Stories # can be tempermental truncating text at top (console limit?) may need to trick around etext = strip_headers(load_etext(11027)).strip() print(etext)
def open_file(self, file_id):
    """ Opens a file from project gutenberg """
    return load_etext(file_id)
print total
i = 0
for (dirpath, dirnames, filenames) in walk(gutenberg_path):
    for filename in filenames:
        f = "/".join([dirpath, filename])
        if(f.endswith(".rdf")):
            # print f
            i += 1
            bf = BeautifulSoup(open(f))
            subjects = bf.find_all("dcterms:subject")
            if (subjects is not None and len(subjects) > 0):
                for subject in subjects:
                    val = subject.find_all("rdf:value")[0].contents[0]
                    for i_subject in i_subjects:
                        if(i_subject in val.lower()):
                            # print f, val
                            id = int(basename(f)[2:-4])
                            fn = str(id).zfill(10) + "_" + i_subject + ".txt"
                            print fn
                            try:
                                text = strip_headers(load_etext(id)).strip().encode("utf-8")
                                wf = "./texts/" + fn
                                with open(wf, "w") as text_file:
                                    text_file.write(text)
                                print i, total, float(i)/total
                            except:
                                print "broken", id

# for network in tree.findtext('dcterms subject'):
#     print network
# Written in Python 3.5
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

librosCodigo = {"Francés": [13735, 13808],
                "Español": [24925, 15027],
                "Portugés": [14904, 16384],
                "Inglés": [10422, 1013]}

dic_idiomas = {}
for idioma in librosCodigo.keys():
    diccionario_largo_palabras = {}
    for indeCo in librosCodigo[idioma]:
        texto = strip_headers(load_etext(indeCo))
        dic_idiomas[idioma] = diccionario_largo_palabras
        for caracter_especial in ['"', "...", "¿", "?", "=", "_", "[", "]", "(", ")",
                                  ",", ".", ":", ";", "!", "¡", "«", "»", "*", "~",
                                  "' ", " '", "- ", " -", "--"]:
            texto = texto.replace(caracter_especial, " ")
        palabras = texto.split()
        for palabra in palabras:
            largo_palabra = len(palabra)
            if largo_palabra in diccionario_largo_palabras:
                diccionario_largo_palabras[largo_palabra] = diccionario_largo_palabras[largo_palabra] + 1
            else:
                diccionario_largo_palabras[largo_palabra] = 1

print(dic_idiomas)
"""
Created on Sun Sep 20 19:49:20 2015

@author: weizhi
"""
import nltk
from nltk.corpus import gutenberg
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.acquire import metadata

text = load_etext(201)
print text[:100]
# assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')

import rdflib
g = rdflib.Graph()

from gutenberg.acquire import metadata
output = metadata._create_metadata_graph(store='Sleepycat')
# downLoad = metadata._download_metadata_archive()

from gutenberg.query.api import get_metadata  # noqa
def downloadText(textID):
    print "Downloading", textID
    text = strip_headers(load_etext(textID)).strip()
    return text
for item in ners:
    # Loop over the Stanford NER (per/ person) results,
    # and apply probablepeople, which raises when fails, (so try).
    if "per" in item["tag"].lower():
        try:
            result = parse(item.get('string'))
        except:
            log.error("Could not run probablepeople")
        if result:
            result = parse(item["string"])
            pp.append(result)

ner["pp"] = pp
return ner


if __name__ == '__main__':
    if len(sys.argv) >= 2 and 'test' in " ".join(sys.argv):
        import doctest
        doctest.testmod(verbose=True)

    if len(sys.argv) >= 2 and 'profile' in " ".join(sys.argv):
        from gutenberg.acquire import load_etext
        from gutenberg.cleanup import strip_headers
        from pycallgraph import PyCallGraph
        from pycallgraph.output import GraphvizOutput

        text = smart_text(strip_headers(load_etext(54807)).strip())
        with PyCallGraph(output=GraphvizOutput()):
            stanford_ner_wrapper(text, 9992, True)
        except Exception, e:
            logging.error("%s: %s" % (path, e))
            # raise e

    @classmethod
    def text_from_zip(cls, path, rdf_catalog_path=None):
        """Return a ProjectGutenbergText object from a zip file."""
        archive = zipfile.ZipFile(path)
        inside = archive.filelist
        filenames = [x.filename for x in inside]
        if len(inside) != 1:
            logging.warn("Supposedly plain-text %s has %d files in zip: %s" % (
                path, len(inside), ", ".join(filenames)))
        possibilities = [x for x in filenames if x.lower().endswith(".txt")]
        data = archive.read(possibilities[0])
        return ProjectGutenbergText(data, path, rdf_catalog_path)

    @property
    def paragraphs(self):
        return self.text.split("\r\n\r\n")


Obj = ProjectGutenbergText(text, name=None, rdf_catalog_path=raw_data)

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')
# -*- coding:utf-8 -*-
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

dic_cont_interjecciones = {}

textos = [load_etext(1619)]

guardar_cadena = 0
cadena = ''
interjecciones = {}

for texto in textos:  # Repeat the loop for each book
    texto = strip_headers(texto).lower()
    for caracter in texto:  # walk the text character by character
        if caracter == u'¡':
            guardar_cadena = 1
        if caracter == u'!':
            cadena = cadena + caracter
            if cadena in interjecciones.keys():
                interjecciones[cadena] += 1
            else:
                interjecciones[cadena] = 1
            guardar_cadena = 0
        if guardar_cadena == 1:
            cadena = cadena + caracter

for interjeccion in sorted(interjecciones.keys()):
    print interjeccion, interjecciones[interjeccion]
from numpy import random
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

bookNumber = set(random.randint(10, 50024, size=2000))
# f.write(foo.encode('utf8'))
metaInfo = []
for item in bookNumber:
    # print item
    try:
        # loading the raw txt
        text = load_etext(item)
        data = text.split("\n")
        # save the txt data path
        filePath = rdfPath + '/' + str(item) + '/' + str(item) + '.txt'
        f = open(filePath, 'w')
        f.write(text.encode('utf8'))
        f.close()
        # get the meta data
        Dict = obj.metaData(data)
        metaInfo.append((Dict, filePath))
        print len(metaInfo)
    except:
        continue

# %% do the data mining to these txt, author, title, release time, etc,
#    need time to work on this part