def model_structure():
    """Build the character-level LSTM model and its training data.

    If the cached corpus ``TEXTFILE`` does not exist yet, it is created by
    extracting the plain text of every ``*.epub`` under ``BUILDDIR``.

    Returns:
        tuple: ``(model, x, y, charidx)`` — the tflearn ``SequenceGenerator``,
        the training inputs and one-hot targets, and the char→index dict.
    """
    if not TEXTFILE.exists():
        text = ''
        for path in BUILDDIR.glob('*.epub'):
            book = open_book(path)
            lines = convert_epub_to_lines(book)
            for line in lines:
                # Explicit parser: the original bare BeautifulSoup(line) emits
                # a "no parser specified" warning and silently picks whatever
                # parser happens to be installed, making output machine-dependent.
                soup = BeautifulSoup(line, 'html.parser')
                text += soup.get_text()
        TEXTFILE.write_text(text)
    x, y, charidx = textfile_to_semi_redundant_sequences(TEXTFILE,
                                                         seq_maxlen=SEQ_MAXLEN)
    # Three stacked 512-unit LSTM layers with dropout, softmax over the charset.
    g = tflearn.input_data([None, SEQ_MAXLEN, len(charidx)])
    g = tflearn.lstm(g, 512, return_seq=True)
    g = tflearn.dropout(g, 0.5)
    g = tflearn.lstm(g, 512, return_seq=True)
    g = tflearn.dropout(g, 0.5)
    g = tflearn.lstm(g, 512)
    g = tflearn.dropout(g, 0.5)
    g = tflearn.fully_connected(g, len(charidx), activation='softmax')
    g = tflearn.regression(g, optimizer='adam',
                           loss='categorical_crossentropy',
                           learning_rate=0.001)
    model = tflearn.SequenceGenerator(g, dictionary=charidx,
                                      seq_maxlen=SEQ_MAXLEN,
                                      clip_gradients=5.0,
                                      checkpoint_path=MODEL)
    return model, x, y, charidx
def EpubtoTxt():
    """Append the text of the epub at the module-global ``File`` to
    ``Output_File``, then clean up the module-global temp directory ``td``.
    """
    # NOTE: the original declared `global File`, but File is only read here,
    # never assigned, so the declaration was a no-op and has been dropped.
    from epub_conversion.utils import open_book, convert_epub_to_lines, convert_lines_to_text
    book = open_book(File)
    lines = convert_epub_to_lines(book)
    # Context manager guarantees the handle is closed even if a write fails
    # (the original leaked the handle on any exception before f.close()).
    with open(Output_File, "a", encoding="utf-8") as f:
        for a in lines:
            f.write(''.join(convert_lines_to_text(a)))
    td.cleanup()
def book_read(book_name):
    """Read an epub book and return its whitespace-normalized plain text."""
    epub = open_book(str(book_name))
    raw_lines = convert_epub_to_lines(epub)
    print(len(raw_lines))
    joined = "\n".join(raw_lines)
    # Strip markup, then collapse runs of spaces and blank lines.
    cleaned = get_text(joined)
    cleaned = re.sub("[ ]+", " ", cleaned)
    cleaned = re.sub("[\n]+", "\n", cleaned)
    return cleaned
def book_read(book_name):
    """Read an epub, dump its raw markup next to it as .html, and return
    the whitespace-normalized plain text."""
    epub = open_book(str(book_name))
    raw_lines = convert_epub_to_lines(epub)
    print(len(raw_lines))
    markup = "\n".join(raw_lines)
    # Keep a copy of the raw markup alongside the source book for inspection.
    with open(str(book_name) + ".html", "w", encoding="utf-8") as fp:
        fp.write(markup)
    cleaned = get_text(markup)
    cleaned = re.sub("[ ]+", " ", cleaned)
    cleaned = re.sub("[\n]+", "\n", cleaned)
    return cleaned
def convert(target_path):
    """Concatenate the text of every epub under ``friendimorphs_path`` into
    one gzip-compressed UTF-8 file at *target_path*."""
    epub_paths = get_files_from_path(".epub", friendimorphs_path)
    with gzip.open(target_path, "wb") as file:
        for epub_path, epub_name in epub_paths:
            book = open_book(epub_path)
            # Guard clause: report unreadable books and move on.
            if book is None:
                print("Couldn't open \"%s\"." % (epub_name))
                continue
            sentences = convert_lines_to_text(convert_epub_to_lines(book))
            for sentence in sentences:
                file.write(sentence.encode("utf-8"))
            print("Wrote \"%s\" to disk" % (epub_name))
def consume_epub(self, filepath, title="untitled epub file"):
    "Take an epub file as input and create qa pairs"
    # Load the epub and flatten it into one space-joined string of markup.
    book = open_book(filepath)
    text = " ".join(convert_epub_to_lines(book))
    # Drop all tags, then normalize a couple of troublesome characters.
    text = re.sub("<.*?>", "", text)
    # presumably replaces a non-breaking space with a plain space — the two
    # literals render identically; TODO confirm against the original bytes
    text = text.replace(" ", " ")
    # Unicode hyphen (U+2010) -> ASCII hyphen.
    text = text.replace("‐", "-")
    # Replace remaining HTML entities (&amp; etc.) with a space.
    text = re.sub("&.*?;", " ", text)
    # make paragraph limitation as expected in self.consume_var:
    # carriage returns become paragraph breaks, then runs of blank lines
    # collapse to a single blank line.
    text = text.replace("\r", "\n\n")
    text = re.sub("\n\n\n*", "\n\n", text)
    text = self._sanitize_text(text)
    # Hand the cleaned text to the QA-pair pipeline, one paragraph at a time.
    self.consume_var(text, title, per_paragraph=True)
def get_text(text_filename):
    """Load source text from sources/<name>.txt, .epub, or .pdf.

    Tries the extensions in that order; returns '' when none exists.
    """
    article_text = ''
    base = 'sources/' + text_filename
    if path.exists(base + ".txt"):
        with open(base + '.txt', encoding="utf8") as file:
            article_text = file.read().replace('\n', ' ')
    elif path.exists(base + ".epub"):
        epub_lines = convert_epub_to_lines(open_book(base + ".epub"))
        article_text = ' '.join(epub_lines)
    elif path.exists(base + ".pdf"):
        raw = parser.from_file(base + ".pdf")
        print(raw['content'])
        article_text = raw['content']
    return article_text
def _parse_book(epub_file):
    """Convert an epub file to a list of tag-stripped, non-empty text lines."""
    raw_lines = convert_epub_to_lines(open_book(epub_file))
    return [strip_tags(raw) for raw in raw_lines if raw not in ('', '\n')]
def epub2txt(path, extension):
    """Convert an epub to a .txt file and return the new .txt path.

    Args:
        path: filesystem path of the epub file.
        extension: the extension of *path* (e.g. ".epub") to swap for ".txt".
    """
    from epub_conversion.utils import open_book, convert_epub_to_lines
    from xml_cleaner import to_raw_text
    # BUG FIX: str.replace substitutes *anywhere* in the path, so a directory
    # named "my.epub.books/" would also be rewritten. Only replace a trailing
    # extension; fall back to the old behavior if it doesn't match the end.
    if path.endswith(extension):
        outputPath = path[:-len(extension)] + ".txt"
    else:
        outputPath = path.replace(extension, ".txt")
    lines = convert_epub_to_lines(open_book(path))
    # Open the output once (append mode preserved) instead of re-opening it
    # for every kept line as the original did.
    with open(outputPath, "a") as f:
        for line in lines:
            line = to_raw_text(line, keep_whitespace=True)[0]  # we strip out markup
            # we only keep longer lines to avoid titles and pagination
            if len(line) > 15 and line[0] != "<":
                f.write("".join(line) + "\n")
    return outputPath
def train():
    """Build the training corpus from the epubs under BUILDDIR (if the
    cached TEXTFILE is missing) and vectorize it into training sequences."""
    if not TEXTFILE.exists():
        text = ''
        for path in BUILDDIR.glob('*.epub'):
            print(f'======= {path} =======')
            book = open_book(path)
            lines = convert_epub_to_lines(book)
            for line in lines:
                # Explicit parser: bare BeautifulSoup(line) warns and picks
                # whichever parser is installed, making output machine-dependent.
                soup = BeautifulSoup(line, 'html.parser')
                text += soup.get_text()
            # Progress peek at the start of the accumulated corpus.
            print(text[:150])
        TEXTFILE.write_text(text)
    x, y, charidx = textfile_to_semi_redundant_sequences(TEXTFILE, seq_maxlen=5)
def get_source_text():
    """Load the user's source text from sources/<name>.{txt,epub,pdf,srt}.

    Tries the extensions in that order and returns the extracted text.

    Raises:
        FileNotFoundError: if no file with a supported extension exists.
    """
    base = 'sources/' + USERDATA_.text_filename
    if path.exists(base + ".txt"):
        with open(base + '.txt', encoding="utf8") as file:
            source_text = file.read().replace('\n', ' ')
    elif path.exists(base + ".epub"):
        book = open_book(base + ".epub")
        convertedBook = convert_epub_to_lines(book)
        source_text = ' '.join(convertedBook)
    elif path.exists(base + ".pdf"):
        raw = parser.from_file(base + ".pdf")
        source_text = raw['content']
    elif path.exists(base + ".srt"):
        source_text = convert_srt_to_text(base + ".srt")
    else:
        # BUG FIX: previously fell through with `source_text` unbound and the
        # final return raised a confusing UnboundLocalError.
        raise FileNotFoundError(
            "no .txt/.epub/.pdf/.srt source found for " + base)
    return source_text
def open_file(self):
    """Let the user pick an epub, cache a text extraction of it, and build
    the sorted unique word list into ``self.word_list``."""
    try:
        # Modal file picker anchored to the main window.
        file = askopenfilename(parent=self.master)
        # Keep a copy of the chosen epub under <script dir>/ebooks.
        os.makedirs(f"{sys.path[0]}/ebooks", exist_ok=True)
        if not os.path.isfile(
                f"{sys.path[0]}/ebooks/{Path(file).stem}.epub"):
            copy(file, f"{sys.path[0]}/ebooks")
        new_file = f"{Path(file).stem}.txt"
        if not os.path.isdir(f"{sys.path[0]}/ebook_text"):
            os.mkdir(f"{sys.path[0]}/ebook_text")
        # Only extract the text once; later runs reuse the cached .txt.
        if not os.path.isfile(f"{sys.path[0]}/ebook_text/{new_file}"):
            print("Reading epub to text..")
            book = open_book(file)
            lines = convert_epub_to_lines(book)
            with open(f"{sys.path[0]}/ebook_text/{new_file}", 'w',
                      encoding='utf-8') as f:
                for line in lines:
                    # Strip markup, lowercase, then split on punctuation so
                    # the cached file holds space-separated bare words.
                    f.writelines(" ".join(
                        re.split(
                            ', |_|\.|\?|;|,|:|-|!|\+|\.\.\.|–|”|…|\(|\)',
                            (BeautifulSoup(line, 'html.parser').text.lower()
                             + "\r\n"))))
            print("Done..")
        else:
            print(f"Text file found.")
        # Build the deduplicated, sorted vocabulary from the cached text.
        with open(f"{sys.path[0]}/ebook_text/{new_file}", 'r',
                  encoding='utf-8') as p:
            print("Generating word list..")
            text = p.read()
            words = sorted(set(text.split()))
            setattr(self, 'word_list', words)
            print("Done..")
    except FileNotFoundError as file_not_found:
        tk.messagebox.showwarning(title="Warning", message=file_not_found)
    except Exception as error:
        tk.messagebox.showerror(title="Unknown Error", message=error)
def process_file(file_path):
    """Return the plain-text content of a .txt or .epub file path.

    Paths matching neither extension yield None (implicitly).
    """
    if '.txt' in file_path:
        with open(file_path) as handle:
            return handle.read()
    if '.epub' in file_path:
        raw_lines = convert_epub_to_lines(open_book(file_path))
        markup = ' '.join(raw_lines)
        # Closing body/html tags would otherwise glue adjacent chunks together.
        markup = markup.replace('</body>', ' ').replace('</html>', ' ')
        soup = BeautifulSoup(markup, 'lxml')
        paragraphs = [
            ''.join(node.findAll(text=True)) + '\n'
            for node in soup.findAll('p')
        ]
        return ''.join(paragraphs)
def linesFromBook(bookTitle):
    """Return the book's lines, tag-stripped and wrapped in carriage returns."""
    cleaned_lines = []
    for raw_line in convert_epub_to_lines(open_book(bookTitle)):
        # mylinetoclean removes the markup; each result goes on its own line.
        formatted = "\r" + mylinetoclean(raw_line) + "\r"
        print(formatted)
        cleaned_lines.append(formatted)
    return cleaned_lines
def ePubConverter_lineByline(base_path):
    """Walk the 'eBook' tree under *base_path* and convert each epub to text."""
    from epub_conversion.utils import open_book, convert_epub_to_lines, convert_lines_to_text
    local_tree_list = Ln.TreeList(base_path, 'eBook')
    for relative_folder_path in local_tree_list:
        full_folder_path = os.path.join(
            base_path, relative_folder_path).rstrip(os.path.sep)
        files = [
            f for f in os.listdir(full_folder_path)
            if os.path.isfile(os.path.join(full_folder_path, f))
        ]
        # NOTE(review): `folders` is computed but never used in the visible
        # code — confirm whether later (unseen) code consumes it.
        folders = [
            f for f in os.listdir(full_folder_path)
            if os.path.isdir(os.path.join(full_folder_path, f))
        ]
        for file in files:
            filename = os.path.join(full_folder_path, file)
            book = open_book(filename)
            lines = convert_epub_to_lines(book)
            # NOTE(review): `ltext` is overwritten on every file and not
            # consumed in the visible code — the snippet may be truncated.
            ltext = convert_lines_to_text(lines)
###### MODIFY CODE HERE ######## path = "path/to/epubs/directory/" output = "name_of_output_file.txt" ###### END OF MODIFY CODE HERE ########### from epub_conversion.utils import open_book, convert_epub_to_lines import os import re from string import digits cleanr = re.compile('<.*?>') remove_digits = str.maketrans('', '', digits) final = [] for filename in os.listdir(path): ret = [] book = open_book(path + filename) lines = convert_epub_to_lines(book) count = 0 length = len(lines) maxim = length - 300 for i in range(len(lines)): cleantext = re.sub(cleanr, '', lines[i]) sos = cleantext[:-1].rstrip() sos = sos.translate(remove_digits) if sos.rstrip() != "" and count > 100 and count < maxim: ret.append(cleantext[:-1].lstrip()) count += 1 for i in range(len(ret)): line = ret[i] sos = re.split(r'\W+', line) if len(sos) > 2:
def __init__(self, filename):
    """Open ``<filename>.epub`` and cache its extracted lines.

    Args:
        filename: book path without the ``.epub`` extension.
    """
    self.__filename = filename
    # BUG FIX: the original read `self.filename` after assigning the
    # name-mangled `self.__filename`, which raises AttributeError unless a
    # `filename` property exists elsewhere in the class — TODO confirm no
    # such property; meanwhile read the attribute actually assigned.
    self.book = open_book(self.__filename + ".epub")
    self.lines = convert_epub_to_lines(self.book)
def __init__(self, filename):
    """Open the epub at *filename* and cache its extracted lines.

    Raises:
        FileNotFoundError: if *filename* does not exist (probed up front).
    """
    # Existence/readability probe before handing the path to open_book; the
    # context manager guarantees the probe handle is closed even on error
    # (the original used a bare open()/close() pair).
    with open(filename):
        pass
    self.book = open_book(filename)
    self.lines = convert_epub_to_lines(self.book)
def main():
    """Parse recipes out of TheFoodLab.epub and load them into recipes.db.

    Scans the book's markup line-by-line, using css-class markers
    ('recipe_rt' = title, 'recipe_i' = ingredient, 'recipe_rsteps' = step)
    to assemble Recipes rows, and assigns the first three recipes to the
    week's menu slots.
    """
    engine = create_engine('sqlite:///recipes.db')
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    book = open_book('data/books/TheFoodLab.epub')
    lines = convert_epub_to_lines(book)
    # Dump the raw lines for debugging/inspection.
    with open('f.txt', 'w') as f:
        for l in lines:
            f.write(l + '\n')
    parser = MyHTMLParser()
    # State machine flags: inside a recipe, and whether we've seen at least
    # one ingredient / one instruction for the current recipe.
    in_recipe = False
    got_ing = False
    got_inst = False
    recipe_count = 0
    # NOTE(review): recipe_dict is never used in the visible code.
    recipe_dict = {}
    week_count = 0
    lc = 0  # current line index (used to peek at the previous line)
    for line in tqdm(lines):
        if 'recipe_rt' in line:
            # A recipe title starts a new recipe; reset its accumulators.
            in_recipe = True
            parser.feed(line)
            recipe_title = parser.data.title()
            ingredients = []
            instructions = []
            try:
                # Best-effort image lookup; recipes work without one.
                url = BingImages(recipe_title, count=1).get()[0]
            except:
                url = None
        if in_recipe:
            if 'recipe_i' in line:
                parser.feed(line)
                ingredient = parser.data.title()
                ingredients.append(ingredient)
                got_ing = True
            elif 'recipe_rsteps' in line:
                parser.feed(line)
                instruction = parser.data.title()
                instructions.append(instruction)
                got_inst = True
            if lc > 0:
                prev = lines[lc - 1]
                # A recipe ends when we have ingredients and steps and the
                # previous line was the last step marker.
                if got_inst and got_ing and 'recipe_rsteps' in prev:
                    recipe_count += 1
                    got_inst = False
                    got_ing = False
                    in_recipe = False
                    recipe = Recipes(title=recipe_title,
                                     ingredients=str(ingredients),
                                     instructions=str(instructions),
                                     url=url)
                    session.add(recipe)
                    # First three recipes fill the weekly menu slots 1-3.
                    if week_count < 3:
                        week_recipe = Week(id=recipe.id,
                                           slot_num=week_count + 1)
                        session.add(week_recipe)
                        session.commit()
                        week_count += 1
        lc += 1
    session.commit()
from epub_conversion.utils import get_files_from_path, convert_epub_to_lines, convert_lines_to_text, open_book

# Extract one hard-coded epub into book.txt as UTF-8 bytes.
book = open_book(
    "Alan Miller, Satoshi Kanazawa Why Beautiful People Have More Daughters From Dating, Shopping, and Praying to Going to War and Becoming a Billionaire.epub"
)
if book is not None:
    # BUG FIX: the original called
    #   convert_lines_to_text(str(convert_epub_to_lines(book)), book)
    # which hands the function the *repr* of a generator object (plus a stray
    # second argument) instead of the lines themselves; pass the line
    # iterator directly, as the library expects.
    with open('book.txt', 'ab') as file:
        for sentence in convert_lines_to_text(convert_epub_to_lines(book)):
            file.write(sentence.encode("utf-8"))
    print("Wrote \"%s\" to disk" % (book))
else:
    print("Couldn't open \"%s\"." % (book))
def downloadBook():
    """Flask route: fetch one or more Project Gutenberg books by id.

    Reads a comma-separated ``id`` query parameter, downloads each book's
    plain-text file (falling back to the epub, converted to text, when the
    .txt is 404), caches each as ``app/books/<id>.json``, and returns a JSON
    object mapping id -> text. The optional ``strip`` parameter (default
    "true") runs gutenberg_cleaner over each text before returning.
    """
    app_log.info(log_sep)
    dict_json = {}
    # Parse the comma-separated list of integer book ids.
    num_arg = request.args['id'].split(",")
    map_obj = map(int, num_arg)
    num_list = list(map_obj)
    os.makedirs(f"app/books/", exist_ok=True)
    #check for strip parameter (optional)
    if 'strip' in request.args:
        strip = request.args['strip'].lower()
    else:
        strip = "true"
    for num in num_list:
        url = f"https://www.gutenberg.org/cache/epub/{num}/pg{num}.txt"
        filename = f"app/books/{num}.json"
        # bookCheck: presumably True when the book is already cached on
        # disk — TODO confirm.
        if (not (bookCheck(num))):
            try:
                response = requests.get(url)
                if (response.status_code ==
                        404):  #if txt file doesnt exsist, check for epub
                    response = requests.get(
                        f"https://www.gutenberg.org/cache/epub/{num}/pg{num}.epub"
                    )
                    response.raise_for_status()
                    app_log.info("Converting epub...")
                    print("Converting epub...")
                    # Write the epub to a temp file so open_book can read it.
                    os.makedirs(f"app/tmp/", exist_ok=True)
                    with open("app/tmp/temp.epub", "wb") as f:
                        f.write(response.content)
                        f.close()
                    book = open_book("app/tmp/temp.epub")
                    lines = convert_epub_to_lines(book)
                    data = '\n'.join(lines)
                    book.close()
                    cleanr = re.compile('<.*?>')  #removes html encoding
                    data = re.sub(cleanr, '', data)
                    shutil.rmtree("app/tmp")
                else:
                    response.raise_for_status()
                    data = response.content.decode()
            except requests.exceptions.HTTPError as a:
                app_log.info(a)
                return a
            except requests.exceptions.RequestException as e:
                app_log.info(e)
                return e
            # Cache the downloaded text as {"<id>": text} JSON on disk.
            with open(filename, 'w') as outfile:
                dict_json[str(num)] = data
                temp = {}
                temp[str(num)] = data
                json.dump(temp, outfile)
            LRU(num)
        else:
            # Cache hit: refresh LRU order and load the cached JSON.
            LRU(num)
            with open(filename) as json_file:
                dict_json.update(json.load(json_file))
    if (strip == "true"):
        # Remove Gutenberg headers/footers from every returned text.
        for book_text in dict_json:
            dict_json[book_text] = gutenberg_cleaner.simple_cleaner(
                dict_json[book_text])
    #return json.dumps(f, indent = 4)
    return jsonify(dict_json)
def _read_epub(path: Union[str, os.PathLike]) -> str:
    """Open the epub at *path* and return its lines joined by newlines."""
    epub = open_book(path)
    lines = convert_epub_to_lines(epub)
    return "\n".join(lines)
def convert(name):
    """Return the raw lines extracted from the epub file *name*."""
    return convert_epub_to_lines(open_book(name))