def __init__(self, text):
    # Pass in the text and process it.
    self.original_striped_text = strip_headers(text).strip()
    self.text = strip_headers(text).strip()
    self.sentences = []
    self.word_frequencies = {}
    self.sentence_scores = {}
    self.preprocesstext()
def generate_paragraph():
    '''
    Generates a random paragraph from a Project Gutenberg text.

    :return: text from the Gutenberg Project with whitespace and non-alphabetic
        characters removed and all characters lower-cased
    :rtype: str
    '''
    # Get the text from the Gutenberg Project; in this case it's Moby Dick.
    text = strip_headers(load_etext(2701)).strip()
    # text = ("Jack and Jill ran up the hill to get a pail of water. "
    #         "Jack fell down and broke his crown and Jill came tumbling after.")

    sentences = []
    for sentence in text.split("."):
        sentences.append(sentence)

    # Select two random sentences.
    paragraph = random.choice(sentences) + random.choice(sentences)
    paragraph = re.sub(r'\s+', '', paragraph)
    regex = re.compile('[^a-zA-Z]')
    paragraph = regex.sub('', paragraph).lower()
    return paragraph
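# A minimal usage sketch (an assumption, not part of the original snippet). With the
# imports the function above already relies on, the result is one lower-case,
# alphabetic string built from two randomly chosen sentences of Moby Dick.
import random
import re
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

blob = generate_paragraph()
print(len(blob), blob[:60])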
def analyze(file, mode=None, stripmap=None):
    """Analyze a plain-text file, optionally skipping Gutenberg/Wikipedia headers and
    footers, and return a dictionary mapping words to their frequencies. A map of
    characters to strip from each word may also be provided for efficiency when calling
    this function many times (as we do for this experiment); if none is provided, one
    is generated before processing."""
    # res is an empty dictionary which we will populate with word/count pairs and return.
    res = {}
    # If we don't have a stripmap, generate one.
    if stripmap is None:
        stripmap = generate_stripmap()
    # We need to determine the character encoding of the file before processing.
    import chardet
    enc = chardet.detect(open(file, 'rb').read())['encoding']
    # fin is the file object. We open it with the detected encoding to collect its text
    # (sans headers if Gutenberg), then close it.
    fin = open(file, encoding=enc, errors='ignore')
    # words is an empty list which we will populate with all words from the source text.
    words = []
    if mode == 'Gutenberg':
        from gutenberg.cleanup import strip_headers
        text = strip_headers(''.join(fin.readlines()))
    elif mode == 'Wikipedia':
        text = ''
        for line in fin:
            if "<doc" not in line and "</doc" not in line:
                text += line
    else:
        text = ''.join(fin.readlines())
    fin.close()
    # The extracted text is full of punctuation, capitalization, and newlines which are
    # undesirable for our purposes. We just want the words.
    for word in text.split():
        words.extend(word.translate(stripmap).lower().split())
    # Analyze the words and generate our frequency map.
    for word in words:
        res[word] = res.setdefault(word, 0) + 1
    return res
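# The analyze() snippet above depends on a generate_stripmap() helper that is not shown.
# A minimal sketch of what such a translation table could look like (the exact character
# set is an assumption), built once with str.maketrans and reused across calls:
import string

def generate_stripmap():
    # Map punctuation and digits to spaces so word.translate(stripmap) splits words cleanly.
    chars_to_strip = string.punctuation + string.digits
    return str.maketrans(chars_to_strip, ' ' * len(chars_to_strip))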
def generate_samples(self, data_dir, tmp_dir, dataset_split): del data_dir del tmp_dir del dataset_split # pylint: disable=g-import-not-at-top from gutenberg import acquire from gutenberg import cleanup # pylint: enable=g-import-not-at-top books = [ # bookid, skip N lines (19221, 223), (15553, 522), ] for (book_id, toskip) in books: text = cleanup.strip_headers(acquire.load_etext(book_id)).strip() lines = text.split("\n")[toskip:] prev_line = None ex_count = 0 for line in lines: # Any line that is all upper case is a title or author name if not line or line.upper() == line: prev_line = None continue line = re.sub("[^a-z]+", " ", line.strip().lower()) if prev_line and line: yield { "inputs": prev_line, "targets": line, } ex_count += 1 prev_line = line
def text_invalidates_entry(text):
    """
    Determine if there is anything obvious in the text that would invalidate it as a valid novel

    >>> from gender_novels.corpus_gen import text_invalidates_entry
    >>> text_invalidates_entry("Translator: George Fyler Townsend")
    True
    >>> from gender_novels.corpus_gen import get_novel_text_gutenberg
    >>> import os
    >>> current_dir = os.path.abspath(os.path.dirname(__file__))
    >>> filepath = Path(current_dir, r"corpora/sample_novels/texts/hawthorne_scarlet.txt")
    >>> scarlet_letter = get_novel_text_gutenberg(filepath)
    >>> text_invalidates_entry(scarlet_letter)
    False

    :param text: str
    :return: boolean
    """
    if text.find("Translator: ", 0, 650) != -1:
        return True
    text = strip_headers(text)
    text_length = len(text)
    # Animal Farm is roughly 166,700 characters including boilerplate.
    # Guinness World Records states that the longest novel is 9,609,000 characters long.
    if text_length < 140000 or text_length > 9609000:
        return True
    return False
def generate_samples(self, data_dir, tmp_dir, dataset_split): del data_dir del tmp_dir del dataset_split books = [ # bookid, skip N lines (19221, 223), (15553, 522), ] for (book_id, toskip) in books: text = cleanup.strip_headers(acquire.load_etext(book_id)).strip() lines = text.split("\n")[toskip:] for line in lines: # Any line that is all upper case is a title or author name if not line or line.upper() == line: continue line = re.sub("[^a-z]+", " ", line.strip().lower()) if line: l = len(line) if l > 100: l = 100 yield { "inputs": line, "label": l, }
def sanitize_texts(directory): """ Strip all header and copyright information from downloaded text files in the specified directory using gutenberg.strip_headers module and ensure proper file encodings. :param directory: <String> A string containing the full path to directory containing files to strip :return: """ for item in os.listdir(directory): file_path = os.path.join(directory, item) if os.path.isfile(file_path): # Detect file encoding, takes time to run with open(file_path, 'rb') as inf: text = inf.readlines() detector = UniversalDetector() for line in text: detector.feed(line) if detector.done: break detector.close() encoding = detector.result['encoding'] # Open file, strip headers, and save result with open(file_path, 'r', encoding=encoding) as inf: text = inf.read() text = strip_headers(text).strip() os.remove(file_path) with open(file_path, 'w+', encoding=encoding) as outf: outf.write(text)
def getSomeBooks(howManyBooks, startingAt=1):
    i = howManyBooks
    ii = startingAt
    nothing = 0
    valError = 0
    otherError = 0
    allTheBooks = []
    while i > len(allTheBooks):  # 54096 ceiling
        try:
            theText = strip_headers(load_etext(ii)).strip()  # load the full text into theText
            theLength = len(theText)
            if theLength > 292:
                allTheBooks.append([ii, theText])
                print("one more book in the list, book number:", ii,
                      "book total is:", len(allTheBooks))
            else:
                nothing = nothing + 1
                print("nothing here at number:", ii)
        except ValueError:
            valError = valError + 1
            print("valueError at book number:", ii)
        except Exception:
            otherError = otherError + 1
            print("otherError at book number:", ii)
        ii = ii + 1
    print('all done')
    print(len(allTheBooks))
    return allTheBooks
def post_corpora(url, auth_token): corpora = acquire_corpora() text = strip_headers(load_etext(corpora[0])).strip() print(corpora, text[:100]) authentication_token = {'authentication-token': auth_token} # data to post files = {'file': io.StringIO(text)} data = { 'label': '{} {}'.format(corpora[1], corpora[3]), 'source': corpora[2] } # post ru = requests.post(url, headers=authentication_token, files=files, data=data) print(ru.url, ru.status_code) if ru.ok: print(ru.json()) else: print(ru.status_code, ru.reason)
def sample_paragraphs(book_id, n_parag, min_length):
    """Get a book as a text file and randomly sample a fixed number of paragraphs."""
    # Get the book as a string and remove the Project Gutenberg metadata.
    book = load_etext(book_id)
    book = strip_headers(book).strip()
    # Remove the character we'll later use as a separator.
    book = book.replace("|", " ")
    # Split into paragraphs.
    parag = book.split("\n\n")
    # Remove single line breaks within paragraphs.
    parag = [x.replace("\n", " ") for x in parag]
    # Remove paragraphs below a certain length.
    parag = [p for p in parag if len(p) > min_length]
    # Exclude the first/last 10 paragraphs from sampling as they may contain remaining metadata.
    parag = parag[10:-10]
    # Sample paragraphs.
    if n_parag is not None:
        if n_parag > len(parag):
            raise ValueError(
                "The number of paragraphs to sample is higher than the "
                "total number of paragraphs."
            )
        seed(42)
        sample_ind = randint(0, len(parag), n_parag)
        parag_sampled = [parag[i] for i in sample_ind]
    else:
        # If n_parag is None, all paragraphs are kept.
        parag_sampled = parag
    return parag_sampled
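# Example call (an illustration, not part of the original snippet): book id 2701 is
# Moby Dick; seed/randint in the sampler above are assumed to come from numpy.random.
paragraphs = sample_paragraphs(2701, n_parag=5, min_length=500)
for p in paragraphs:
    print(p[:80], "...")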
def load_macbeth(): """ Sources Macbeth from Project Gutenberg, returns a cleaned dataframe of the play split by act, scene, speaker, and sentence. """ raw_text = load_etext(1533) # Collect the text raw_text = strip_headers(raw_text) # Remove most metadata # Remove in-line stage directions raw_text = remove_in_line_stage_directions(raw_text) # Split the text into sentences sentences = separate_sentences(raw_text) # Remove introductory data, keeping only the text sentences = sentences[110:] # Create a dataframe from the sentences macbeth = create_play_data_frame(sentences) # Clean the dataframe macbeth = clean_macbeth(macbeth) # Add a token column macbeth["tokens"] = create_token_column(macbeth["sentence"]) # Return the finished dataframe return macbeth
def download(cfg): print('Downloading Gutenberg data to: ' + cfg.directory) # Load language data for all books. path = os.path.join('code', 'utils', 'metadata.txt') with open(path, encoding='utf-8') as f: counter = 0 for line in f: [index, lang, r, author, title] = line.split('\t') r = int(r) i = int(index) if counter < cfg.max_books and r == 1 and lang in cfg.languages: # Get the book. try: text = strip_headers(load_etext(i)).strip().encode('utf-8') except UnknownDownloadUriException: print('Could not download book: ' + str(i)) continue # Save the file to the correct directory. path = os.path.join(cfg.directory, lang) if not os.path.exists(path): os.mkdir(path) with open(os.path.join(path, str(i) + '.txt'), 'wb') as f: f.write(text) counter += 1 if not counter % 1000: print('Downloaded ' + str(counter) + ' books')
def get_gutenberg_text(id):
    try:
        text = strip_headers(load_etext(id)).strip()
        return text
    except Exception as ex:
        print(ex)
        return ''
def load_gutenberg(self, language='en'): texts = get_etexts('author', self.author) texts = { t: list(get_metadata("title", t))[0] for t in texts if list(get_metadata("language", t))[0] == language } new_texts = dict() dupes = list() for k, d in texts.items(): d = d.replace("\r\n", " ") if d not in dupes: dupes.append(d) new_texts[k] = d try: self.books[d] = strip_headers( load_etext(k)).strip().split("\r\n\r\n") except UnknownDownloadUriException: print( f'Book "{d}" does not have a text format and was not loaded.' ) del new_texts[k] dupes.remove(d) continue self.tokens[d] = [ nltk.pos_tag(nltk.word_tokenize(self.books[d][b])) for b in range(len(self.books[d])) ] else: pass texts = new_texts print(texts)
def delete_introduction_ending(full_text):
    # This function deletes extraneous boilerplate from Peter_Pan_full_text.
    from gutenberg.cleanup import strip_headers
    no_intro = strip_headers(full_text).strip()
    # print(no_intro)
    print('headers are gone')
    return no_intro
def main(): """ The main method. """ parser = argparse.ArgumentParser( description='Word suggestion based on Project Gutenberg books.') parser.add_argument('--book-id', dest='book_ids', nargs='+', type=int, required=True, help='the book id of the Project Gutenberg') parser.add_argument('--query', nargs='+', type=str, required=True, help='suggest next word for list of string', action=required_length(1, 5)) try: args = parser.parse_args() text_indexer = TextIndexer(len(args.query)) for book_id in list(dict.fromkeys(args.book_ids)): text = strip_headers(load_etext(book_id)).strip() text_indexer.add_text(book_id, text) print(text_indexer.suggest(*args.query)) except Exception as exc: # pylint: disable=W0703 print(exc)
def get_featurelists(book): # Preparation for topic features: get 100 most common uni-, bi- and trigrams of the given book common_ngrams = get_common_ngrams(book) # Extract the features of the given book features_book = (book_id, extract_features(book, common_ngrams)) # Create new file and write the features of the given book to it path_feat_book = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_book.txt" with open(path_feat_book, 'r+', encoding="utf-8") as output_book: output_book.write(str(features_book)) output_book.close() # Create new file to write the features of the dataset books to path_feat_books = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_dataset.txt" output_dataset = open(path_feat_books, 'r+', encoding="utf-8") # Extract the features of the dataset books features_dataset = [] for i in IDs: features_dataset.append((i, extract_features( strip_headers(load_etext(i)).strip(), common_ngrams))) # Write the features to the output file output_dataset.write("\n Book " + str(i) + ": ") output_dataset.write(str(features_dataset[len(features_dataset) - 1])) output_dataset.close() return features_book, features_dataset
def poetry_cleaner(poetry_books=BOOKS): with open(INPUT_DATA_WRITE_PATH + OUT_PATH, 'w') as ofp: lineno = 0 for (id_nr, toskip, title) in poetry_books: startline = lineno text = strip_headers(load_etext(id_nr)).strip() lines = text.split('\n')[toskip:] for line in lines: if 0 < len(line) < 50 and line.upper( ) != line and not re.match('.*[0-9]+.*', line): cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower()) if lineno < 100: ofp.write(cleaned) ofp.write('\n') lineno = lineno + 1 else: ofp.write('\n') print('Wrote lines {} to {} from {}'.format(startline, lineno, title))
def init_books(author_file, json_file): """initialize book list with texts and save it to disk""" with open(author_file) as f: authors = list(f) authors = [i.strip() for i in authors] books = [] for author in authors: s = get_etexts('author', author) for i in s: try: if list(get_metadata('language', i))[0] == 'en': title, etext = list(get_metadata( 'title', i))[0], strip_headers(load_etext(i)).strip() b = Book(i, title, etext) books.append(b) except UnknownDownloadUriException: # this book does not have a load_etext corresponding to it. pass with open(json_file, 'wb') as f: pickle.dump(books, f) print(len(books))
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return joyce_texts
def regular_view(request, book_num):
    name = Book.get_book_name(book_num)
    bookText = strip_headers(load_etext(book_num)).strip()
    filteredText = removeStopWords(bookText)
    args = {'content': [bookText], 'content2': [filteredText], 'name': name}
    return render(request, "pages/regularText.html", args)
def get_raw_book():
    while True:
        try:
            # 46000 is approximately the size of the Gutenberg catalogue.
            text = load_etext(random.randrange(46000))
        except ValueError:
            # No download method for that text id; try another.
            pass
        else:
            return strip_headers(text)
def search_display_options(my_catalog): search_result_catalog = book_catalog() search_type = input( 'Please select a search type: Author, Subject, Title [Aa/Ss/Tt]: ') if search_type == 'A' or search_type == 'a': search_term = input('Please enter a search term for an Author: ') elif search_type == 'T' or search_type == 't': search_term = input('Please enter a search term for a Title: ') elif search_type == 'S' or search_type == 's': search_term = input('Please enter a search term for a Subject: ') else: print('Invalid search type...') return # set match flag to false match = False # fill up a set of all the titles that match the search for my_book in my_catalog.get_books(): if (search_type == 'a' or search_type == 'A') and set( my_book.get_book_author().lower().split(' ')).intersection( set(search_term.lower().split(' '))): search_result_catalog.add_book(my_book) match = True if (search_type == 't' or search_type == 'T') and set( my_book.get_book_title().lower().split(' ')).intersection( set(search_term.lower().split(' '))): search_result_catalog.add_book(my_book) match = True if (search_type == 's' or search_type == 'S') and set( my_book.get_book_subject().lower().split(' ')).intersection( set(search_term.lower().split(' '))): search_result_catalog.add_book(my_book) match = True search_result_catalog.display_titles_by_author() if match: title_num = input('Please type a title number from the above list: ') print('Displaying Word Cloud in [Subject: ' + my_book.get_book_subject() + '] for [Title: ' + my_book.get_book_title() + '] by [Author:' + my_book.get_book_author() + ']') try: my_book = search_result_catalog.get_book(title_num) return (strip_headers(load_etext(int(title_num))).strip() ) # call that gets bok text from gutenberg except: print('Failed to find a textual download candidate for ' + my_book.get_book_title()) return (None) else: print('No matches found for [' + search_term + ']...') return (None)
def tab():
    with open("BookRoulette.html", "w") as f:
        x = random.randint(1, 60059)
        book = strip_headers(load_etext(x)).strip()
        f.write(book)
    filename = 'file:///' + os.getcwd() + '/' + 'BookRoulette.html'
    webbrowser.open_new_tab(filename)
    return render_template('BookRoulette.html', book=book)
def process_file(filename, outdir):
    outpath = outdir + '/%s.txt'
    with open(filename) as f:
        for line in f:
            spl = line.split('|')
            book = spl[0]
            uids = map(int, spl[3].strip(string.ascii_lowercase + '\n').split(','))
            try:
                with open(outpath % book, 'w', encoding='utf-8') as out:
                    for uid in uids:
                        raw_text = load_etext(uid)
                        try:
                            text = strip_headers(unidecode(raw_text.encode('latin-1').decode('utf-8')))
                        except UnicodeDecodeError:
                            text = strip_headers(raw_text)
                        out.write(text)
            except ValueError as e:
                print('%s|%s' % (book, uid), e)
                os.remove(outpath % book)
def acquire_and_process(name: str, txt_num: int):
    """
    Convenience function that minhashes a Project Gutenberg text given the text id
    number (which can be found on gutenberg.org, for instance in the URL).
    """
    txt = strip_headers(load_etext(txt_num))
    with open("texts/%s.txt" % name, "w") as f:
        f.write(txt)
    process_file("texts/%s.txt" % name)
def download():
    with open("GutProject.doc", "w") as f:
        x = random.randint(1, 60059)
        text = strip_headers(load_etext(x)).strip()
        f.write(text)
    return send_file('GutProject.doc', mimetype='application/msword',
                     attachment_filename='GutProject.doc', as_attachment=True)
def loadNew(self): test = os.listdir(self.new)[0] testB = open(self.new + test) raw = testB.read() text = strip_headers(raw).strip() text = text.replace('\n', ' ') text = text.replace(':', '. ') text = sent_tokenize(text) text = list(filter(lambda x: len(x) > 5, text)) return text
def get_gutenberg_text(book_id):
    """
    Gets the text corresponding to book_id from the Project Gutenberg database.
    """
    try:
        x = strip_headers(load_etext(int(book_id), prefer_ascii=False)).strip()
    except Exception:
        x = None
    return x
def downloadBook():
    """If posting, takes in a book number from getty.html and installs the book into
    the database. Otherwise displays getty.html."""
    if request.method == "POST":
        bookNum = int(request.form.get("bookNum"))
        words = strip_headers(load_etext(bookNum)).strip()
        installText(words)
        return render_template("homepage.html")
    else:
        return render_template("getty.html")
def __init__(self, book_number=2701, first_page=20, last_page=20): self.text = strip_headers(load_etext(book_number)) # print(list_supported_metadatas()) # prints (u'author', u'formaturi', u'language', ...) # print(get_metadata('title', 2701)) # prints frozenset([u'Moby Dick; Or, The Whale']) # print(get_metadata('author', 2701)) # prints frozenset([u'Melville, Hermann']) # print(text) # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...' self.pages = [] self.first_page = first_page self.last_page = last_page self.print_book()
def gutToText(number, name):
    filename = name + "_raw.txt"
    if not os.path.isfile(filename):
        book = open(filename, "w", encoding='utf-8')
        text = strip_headers(load_etext(number)).strip()
        print("Loaded and writing %s" % name)
        book.write(text)
        print("Done writing %s" % name)
        book.close()
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get the Project Gutenberg document ID from the url string.
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search(r"(?:files|ebooks|epub)/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()), mark_deletions=False)
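# Example call (illustrative only; validate_url and super_cleaner are the helpers the
# snippet above already assumes to be importable):
document = get_gutenberg_document("https://www.gutenberg.org/ebooks/2701")
print(document[:200])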
def get_text(self, title, author):
    """
    Fetches the text of a book from the Gutenberg project by its title and returns it as a string.

    PROBLEM HERE -- Gutenberg goes down a lot, so getting a full text did not always work.
    To bypass that, I downloaded some books of mixed languages.
    """
    # get_etexts returns a frozenset of matching text numbers, so it cannot be indexed directly.
    guten_number = list(get_etexts('title', title))[0]
    text = strip_headers(load_etext(guten_number)).strip()
    return text
def fetch_etext(etextno):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.
    """
    download_uri = _format_download_uri(etextno)
    if download_uri:
        response = requests.get(download_uri)
        return strip_headers(response.text.strip()).encode('utf-8')
    return ''
def get_word_list(): """ Reads the specified project Gutenberg book. Header comments, punctuation, and whitespace are stripped away. The function returns a list of the words used in the book as a list. All words are converted to lower case. """ textfile = open("pg32325.txt") full_text = textfile.read() no_punctuation = full_text.translate(None, string.punctuation) no_intro = strip_headers(no_punctuation).strip() convert_ascii = no_intro.encode("ascii") convert_lowercase = string.lower(convert_ascii) list_split = convert_lowercase.split() return list_split
def get_book_text(csvfile):
    """Gets the text for each book using the Project Gutenberg catalog."""
    book_list = open_csv(csvfile)
    for i, value in enumerate(book_list):
        a = int(book_list[i][2])  # a = book number
        print(i, a)
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
def split_chapters(self, full_text): """ Removes header and footer from project gutenberg book. Makes a list of chapters, where each chapter is a sublist of paragraphs """ book = strip_headers(full_text) chapter_list = re.split(ur'\n\bchapter\b \w+\.?', book, flags=re.IGNORECASE) if len(chapter_list) < 2: chapter_list = re.split(ur'\n[IVXCLM]+\n', book) paragraphs_in_chapter_list = [] for i in range(len(chapter_list)): paragraphs_in_chapter_list.append(chapter_list[i].split('\n\r')) # return len(paragraphs_in_chapter_list) return paragraphs_in_chapter_list
def the_text(self): try: self.novel = load_etext(self.novel_num) except: rejects.append(self.novel_num) return False if re.search('Character set encoding: ASCII', self.novel): self.novel = strip_headers(self.novel) self.novel = self.novel.replace('\n', ' ') self.novel = TextBlob(self.novel) self.novel_sentences = self.novel.sentences self.m = str(self.novel_num) with open('novel_'+self.m +'list_1.csv', 'wb') as f: writer = csv.writer(f) for sentence in self.novel_sentences: writer.writerow([sentence]) else: rejects_2.append(self.novel_num) return False
def process_file(direc, file): """ Given a file and directory, extracts the title and author from the file if it's an English language text. Then strips the header information using Gutenberg module and stores the new file in a nameddir directory with filename "title%%%author.txt" for future processing. :param direc: <String> Path to directory containing the file, no trailing '/' :param file: <String> Name of the file """ # Grab author and title from the top if language is English title = "" author = "" lang = False text = "" enc = 'ISO-8859-1' path = direc + '/' + file with open(path, 'r', encoding=enc) as inf: text = inf.read() inf.seek(0) # reset buffer to read author and title for line in inf: if "Title:" in line: title = line.replace("Title: ", "").strip() if "Author:" in line: author = line.replace("Author: ", "").strip() if "Language:" in line and "English" in line: lang = True break # Generate new file name like 'Title%%%Author.txt' for easy lookup filename = title[:min(100, len(title))].replace("/", "") + delim\ + author[:min(100, len(author))].replace("/", "") + ".txt" # Remove copyright and metadata from the file text = strip_headers(text).strip() # Save the file in 'nameddir' (global var) directory if lang: with open(nameddir + '/' + filename, 'w+') as outf: outf.write(text)
def fetch_gutenberg(filename=None): from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers from gutenbergsettings import popularTitles, saveInterval start = time.time() lastsave = start with redirect(filename): try: for title in popularTitles: text = strip_headers(load_etext(title)).strip() serialize([(title, text)], '../serialized/guten%s' % title) sinceLast = time.time() - lastsave print('%s since last save' % sinceLast) if sinceLast > saveInterval: concatenate('guten') lastsave = time.time() except KeyboardInterrupt: concatenate('guten') sys.exit(0)
def test_strip_headers(self): for testcase in SampleText.all(): expected = testcase.clean_text.splitlines() actual = strip_headers(testcase.raw_text).splitlines() lines = zip(actual, expected) for i, (actual_line, expected_line) in enumerate(lines, start=1): self.assertEqual( actual_line, expected_line, u'non-matching lines:\n' u'{previous_lines}\n' u'{lineno_separator}\n' u'got "{actual}"\n' u'expected "{expected}"\n' u'{separator}\n' u'{next_lines}'.format( previous_lines=_previous_lines(i, lines, amount=3), next_lines=_next_lines(i, lines, amount=3), actual=actual_line, expected=expected_line, lineno_separator='line {}:'.format(i).center(80, '-'), separator=''.center(80, '-')))
def main():
    filename = "gutenbergscifi.csv"
    json_filename = "gendered_words.json"
    if os.path.isfile(filename):
        print("file exists")
    else:
        write_csv = extract_text_urls(filename)
        print("file created")
    book_list = open_csv(filename)
    print(book_list)
    for i, value in enumerate(book_list):
        a = int(book_list[i][2])  # a = book number
        print(i, a)
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            # Skip books whose text cannot be downloaded instead of reusing stale text.
            continue
        # print(text)
        clean_text = remove_punc_html(text)
        ratio = gender_ratio(clean_text)
        print(author, title, ratio)
        uber_key = author
        sub_key = title
        sub_value = ratio
        uber_value = {author: {title: ratio}}
        json_source = read_write_json_object(json_filename="gendered_words.json",
                                             uber_key=uber_key, uber_value=uber_value,
                                             sub_key=sub_key, sub_value=sub_value,
                                             READ=False, WRITE=True)
def extract_subroutine(data, src_dir, century): session = model.get_session() backoff = 1 counter = 0 for metadata in data: contents = extract_book_contents(metadata) if contents is None: backoff *= 1.5 continue title = metadata['title'] author = metadata['author'] e_id = metadata['id'] if type(title) == list: title = dasherize(title) text_file_path = os.path.join(src_dir, dasherize(title.split(" "))) text = strip_headers(load_etext(e_id)).strip() f = open(text_file_path, "w") f.write(text) f.close() book = model.Book( title=title, author=author, e_id=e_id, century=century, text_file_path=text_file_path ) session.add(book) session.commit() log.info("successfully added " + title) counter += 1 time.sleep(backoff) log.info("---- finished run. added %d books ----" % counter)
def test_strip_headers(self): for testcase in SampleText.all(): expected = testcase.clean_text.splitlines() actual = strip_headers(testcase.raw_text).splitlines() lines = list(zip(actual, expected)) for i, (actual_line, expected_line) in enumerate(lines, start=1): self.assertTrue( actual_line == expected_line, u('non-matching lines for etext {etextno}:\n' '{previous_lines}\n' '{lineno_separator}\n' 'got "{actual}"\n' 'expected "{expected}"\n' '{separator}\n' '{next_lines}') .format( etextno=testcase.etextno, previous_lines=_previous_lines(i, lines, amount=3), next_lines=_next_lines(i, lines, amount=3), actual=actual_line, expected=expected_line, lineno_separator='line {0}:'.format(i).center(80, '-'), separator=''.center(80, '-')))
def download_book(title, gutenberg_id, data_path, sleep=0): print('downloading {:}'.format(title)) full_text = strip_headers(load_etext(gutenberg_id)).strip() summary = downloadSummary(title) if full_text is None: print('Full text is None. Skipping {:}'.format(title)) return if summary is None: print('Summary is None. Skipping {:}'.format(title)) return output_data = {'title': title, 'full_text': full_text, 'summary': summary} output_file = os.path.join(data_path, '{:}.json'.format(gutenberg_id)) with open(output_file, 'w') as f: json.dump(output_data, f, ensure_ascii=False) time.sleep(sleep)
def getBook(bookDetails): global timeAtLastFetch cachedFilename = "cache/" + fileNameForBook(bookDetails) + ".txt" if os.path.isfile(cachedFilename): with open(cachedFilename) as bookfile: text = bookfile.read() return TextBlob(text) nowMS = milliseconds() timeSinceLastFetch = nowMS - timeAtLastFetch if timeSinceLastFetch < gutenbergWaitTimeMS: waitTime = gutenbergWaitTimeMS - timeSinceLastFetch print " waiting {}ms for Gutenberg...".format(waitTime) time.sleep(waitTime / 1000) bookId = bookDetails['id'] print "Fetching from Gutenberg id {}".format(bookId) source = load_etext(bookId) print " cleaning...." source = removeUnicodeWords.sub("", source) source = removeUnicodeCharacters.sub("", source) source = removePossessive.sub("", source) source = removeWordsWithApostrophe.sub("", source) source = removeHyphens.sub(" ", source) source = removeChapterHeaders.sub("", source) source = removeRomanNumerals.sub("", source) source = removeEllipsis.sub("", source) text = strip_headers(source).strip() timeAtLastFetch = milliseconds() if not os.path.isdir("cache"): os.mkdir("cache") bookfile = open(cachedFilename, 'w') bookfile.write(text) bookfile.close() print " fetched and cached " + bookDetails['title'] return TextBlob(text)
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')
    print(get_metadata('title', gutenberg_id))
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, next_word in trigrams:
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(next_word)

    def generate_using_trigrams():
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)

    return tweets
for item in ners:
    # Loop over the Stanford NER (per/person) results and apply probablepeople,
    # which raises when it fails (hence the try).
    if "per" in item["tag"].lower():
        result = None
        try:
            result = parse(item.get('string'))
        except Exception:
            log.error("Could not run probablepeople")
        if result:
            pp.append(result)
ner["pp"] = pp
return ner


if __name__ == '__main__':
    if len(sys.argv) >= 2 and 'test' in " ".join(sys.argv):
        import doctest
        doctest.testmod(verbose=True)

    if len(sys.argv) >= 2 and 'profile' in " ".join(sys.argv):
        from gutenberg.acquire import load_etext
        from gutenberg.cleanup import strip_headers
        from pycallgraph import PyCallGraph
        from pycallgraph.output import GraphvizOutput

        text = smart_text(strip_headers(load_etext(54807)).strip())
        with PyCallGraph(output=GraphvizOutput()):
            stanford_ner_wrapper(text, 9992, True)
for index, record in df.iterrows(): # Get the key url = record['url'] key = bucket.get_key(url) if key is None: #Remove the utf8 extension from url utf8_extension = url.rfind('.utf8') if (utf8_extension != -1): url = url[0:utf8_extension] key = bucket.get_key(url) if key is None: continue contents = key.get_contents_as_string() contents = unicode(contents, 'utf-8') book_text = strip_headers(contents).strip() book_length = len(book_text) noise_size = int(book_length * 0.05) #Compute offsets for content start_offset = noise_size end_offset = book_length - noise_size #Remove the noise from book text document = book_text[start_offset:end_offset] #Truncate the document at full stops start = document.find('.') end = document.rfind('.') if ((start != -1) and (end != -1)): document = document[start+1:end+1] #Remove special characters and digits pattern = '[^\w+.\s+,:;?\'-]' prog = re.compile(pattern,re.UNICODE)
def split_sentences(text):
    for sentence_separator in [u'. ', u'.\n', u'? ', u'! ', u'?\n', u'!\n', u'; ', u';\n',
                               u'- ', u'--', u'...', u'\n', u'\n\n', u'\n\n\n']:
        text = text.replace(sentence_separator, u'|||')
    return text.split(u'|||')


# Count how many books the corpus contains.
print(u'Total number of Spanish books:', len(codigos_libros.es))

# Now load the books and strip their headers.
dic_oraciones_es = {}
total_palabras_es = 0
for codigo_libro_es in codigos_libros.es:
    texto = load_etext(codigo_libro_es)
    texto = strip_headers(texto)
    # In each book, split the sentences, delimited by the ||| marker.
    oraciones_libro = split_sentences(texto)
    for oracion_libro in oraciones_libro:
        palabras = rufino.split_words(oracion_libro)
        numero_palabras_oracion = len(palabras)
        total_palabras_es += numero_palabras_oracion
        if numero_palabras_oracion not in dic_oraciones_es:
            dic_oraciones_es[numero_palabras_oracion] = 1
        else:
            dic_oraciones_es[numero_palabras_oracion] = dic_oraciones_es[numero_palabras_oracion] + 1

# dic_oraciones_es is a histogram of sentence lengths, so the sentence total is the sum of its values.
print(u'Total number of Spanish sentences:', sum(dic_oraciones_es.values()))
print(u'Total number of Spanish words:', total_palabras_es)
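# A small illustration of the separator-based splitter above, on a made-up sample string:
sample = u'It was the best of times. It was the worst of times! Was it?\nIndeed; it was.'
print(split_sentences(sample))
# -> ['It was the best of times', 'It was the worst of times', 'Was it', 'Indeed', 'it was.']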
""" Created on Wed Aug 12 18:06:45 2015 @author: Tony Description: Pull etext numbers from Project Gutenberg for an author 1) First pip install gutenberg 0.4.0 library for Python from the command line """ from gutenberg.query import get_etexts from gutenberg.query import get_metadata from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers # get the catalogue numbers of all the texts # by Wilhelm Grimm in Project Gutenberg bookList=get_etexts('author', 'Grimm, Wilhelm Carl') # gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591] #Once We can associate a number with a title we can pull the text for number in bookList: print(number,get_metadata('title',number)) print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n') # Once we have the text number we can print the text # example 11027 is the number for Grimm's Fairy Stories # can be tempermental truncating text at top (console limit?) may need to trick around etext = strip_headers(load_etext(11027)).strip() print(etext)
         25777, 25988, 26284, 26655, 27736, 29497, 29506, 29663, 29799, 29831, 30053,
         30122, 30425, 30535, 30903, 30986, 31013, 31464, 31541, 31613, 31637, 31707,
         32235, 32315, 32364, 33690, 35882, 36253, 36453, 36573, 36940, 37067, 37095,
         37139, 37637, 38814, 39444, 39613, 39990, 41746, 42727,)

guardar_cadena = 0
cadena = u''
interjecciones = {}
contador = 0
for texto in textos:  # Repeat the loop for every book.
    texto = load_etext(texto)  # Load the text.
    texto = strip_headers(texto).lower()  # Strip the headers.
    texto = unicode(texto)
    for caracter in texto:  # Walk the text character by character.
        if caracter == u'¡':  # If an opening exclamation mark is found,
            guardar_cadena = 1  # set a flag to start saving the string.
            cadena = cadena + unicode(caracter)
        elif caracter == u'!':  # If a closing exclamation mark is found:
            cadena = cadena + unicode(caracter)  # 1. Save that last character (optional).
            if cadena in interjecciones.keys():  # 2. Check whether the string is already in the dictionary.
                interjecciones[cadena] += 1  # 3. If it is, add one to its counter.
            else:  # 4. If it is not,
                interjecciones[cadena] = 1  # add it and start the count at 1.
            guardar_cadena = 0  # 5. Reset the flag so nothing more is saved.
            cadena = ''
        elif guardar_cadena == 1:  # 6. If the save flag is set,
            cadena = cadena + unicode(caracter)  # append the next character and repeat the loop.
def downloadText(textID):
    print("Downloading", textID)
    text = strip_headers(load_etext(textID)).strip()
    return text
# Written in Python 3.5
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

librosCodigo = {"Francés": [13735, 13808], "Español": [24925, 15027],
                "Portugés": [14904, 16384], "Inglés": [10422, 1013]}
dic_idiomas = {}
for idioma in librosCodigo.keys():
    diccionario_largo_palabras = {}
    for indeCo in librosCodigo[idioma]:
        texto = strip_headers(load_etext(indeCo))
        for caracter_especial in ['"', "...", "¿", "?", "=", "_", "[", "]", "(", ")", ",", ".",
                                  ":", ";", "!", "¡", "«", "»", "*", "~", "' ", " '", "- ", " -", "--"]:
            texto = texto.replace(caracter_especial, " ")
        palabras = texto.split()
        for palabra in palabras:
            largo_palabra = len(palabra)
            if largo_palabra in diccionario_largo_palabras:
                diccionario_largo_palabras[largo_palabra] = diccionario_largo_palabras[largo_palabra] + 1
            else:
                diccionario_largo_palabras[largo_palabra] = 1
    dic_idiomas[idioma] = diccionario_largo_palabras

print(dic_idiomas)
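# Optional follow-up (not in the original script): print each language's word-length
# histogram in sorted order so the distributions are easier to compare.
for idioma, histograma in dic_idiomas.items():
    print(idioma, sorted(histograma.items()))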
        except Exception as e:
            logging.error("%s: %s" % (path, e))
            # raise e

    @classmethod
    def text_from_zip(cls, path, rdf_catalog_path=None):
        """Return a ProjectGutenbergText object from a zip file."""
        archive = zipfile.ZipFile(path)
        inside = archive.filelist
        filenames = [x.filename for x in inside]
        if len(inside) != 1:
            logging.warn("Supposedly plain-text %s has %d files in zip: %s" % (
                path, len(inside), ", ".join(filenames)))
        possibilities = [x for x in filenames if x.lower().endswith(".txt")]
        data = archive.read(possibilities[0])
        return ProjectGutenbergText(data, path, rdf_catalog_path)

    @property
    def paragraphs(self):
        return self.text.split("\r\n\r\n")


Obj = ProjectGutenbergText(text, name=None, rdf_catalog_path=raw_data)

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')
print(total)
i = 0
for (dirpath, dirnames, filenames) in walk(gutenberg_path):
    for filename in filenames:
        f = "/".join([dirpath, filename])
        if f.endswith(".rdf"):
            # print(f)
            i += 1
            bf = BeautifulSoup(open(f))
            subjects = bf.find_all("dcterms:subject")
            if subjects is not None and len(subjects) > 0:
                for subject in subjects:
                    val = subject.find_all("rdf:value")[0].contents[0]
                    for i_subject in i_subjects:
                        if i_subject in val.lower():
                            # print(f, val)
                            id = int(basename(f)[2:-4])
                            fn = str(id).zfill(10) + "_" + i_subject + ".txt"
                            print(fn)
                            try:
                                text = strip_headers(load_etext(id)).strip().encode("utf-8")
                                wf = "./texts/" + fn
                                # The text is already UTF-8 encoded, so write the file in binary mode.
                                with open(wf, "wb") as text_file:
                                    text_file.write(text)
                                print(i, total, float(i) / total)
                            except Exception:
                                print("broken", id)

# for network in tree.findtext('dcterms subject'):
#     print(network)