コード例 #1
0
def model_structure():
    """Assemble the character-level LSTM generator and its training data.

    When ``TEXTFILE`` does not exist yet, it is first written with the
    ``BeautifulSoup`` text of every line of every ``*.epub`` found in
    ``BUILDDIR``.

    Returns:
        ``(model, x, y, charidx)``: the ``tflearn.SequenceGenerator`` and the
        three values returned by ``textfile_to_semi_redundant_sequences``.
    """
    if not TEXTFILE.exists():
        fragments = []
        for epub_path in BUILDDIR.glob('*.epub'):
            for markup in convert_epub_to_lines(open_book(epub_path)):
                fragments.append(BeautifulSoup(markup).get_text())
        TEXTFILE.write_text(''.join(fragments))

    x, y, charidx = textfile_to_semi_redundant_sequences(
        TEXTFILE, seq_maxlen=SEQ_MAXLEN)
    vocab_size = len(charidx)

    # Three stacked 512-unit LSTM layers, each followed by dropout(0.5);
    # only the first two are asked to return sequences.
    net = tflearn.input_data([None, SEQ_MAXLEN, vocab_size])
    net = tflearn.dropout(tflearn.lstm(net, 512, return_seq=True), 0.5)
    net = tflearn.dropout(tflearn.lstm(net, 512, return_seq=True), 0.5)
    net = tflearn.dropout(tflearn.lstm(net, 512), 0.5)

    # One softmax output per entry of the character dictionary.
    net = tflearn.fully_connected(net, vocab_size, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             loss='categorical_crossentropy',
                             learning_rate=0.001)

    model = tflearn.SequenceGenerator(net,
                                      dictionary=charidx,
                                      seq_maxlen=SEQ_MAXLEN,
                                      clip_gradients=5.0,
                                      checkpoint_path=MODEL)
    return model, x, y, charidx
コード例 #2
0
def textfinder(name):
    """Return the text of the book ``FData/<name>`` without HTML markup.

    Args:
        name: File name of the book inside the ``FData`` directory.

    Returns:
        The ``BeautifulSoup`` text of the book's lines joined with spaces.
    """
    markup = " ".join(CETL(open_book(f"FData/{name}")))
    return BeautifulSoup(markup).get_text()
コード例 #3
0
def book_read(book_name):
    """Load an EPUB and return its text with whitespace runs collapsed.

    The number of raw lines is printed, the lines are joined with newlines
    and passed to ``get_text``; runs of spaces and runs of newlines in the
    result are then each squeezed to a single character.

    Args:
        book_name: Path of the EPUB; converted with ``str`` before opening.
    """
    raw_lines = convert_epub_to_lines(open_book(str(book_name)))
    print(len(raw_lines))
    text = get_text("\n".join(raw_lines))
    # Spaces first, then newlines -- same order as the two substitutions
    # have always been applied in.
    for pattern, replacement in (("[ ]+", " "), ("[\n]+", "\n")):
        text = re.sub(pattern, replacement, text)
    return text
コード例 #4
0
def EpubtoTxt():
    """Append the converted text of the EPUB ``File`` to ``Output_File``.

    Reads the module-level names ``File`` (input path) and ``Output_File``
    (output path, opened in append mode as UTF-8). Every line of the book is
    passed to ``convert_lines_to_text`` and the pieces it yields are written
    back-to-back. ``td.cleanup()`` is called once writing has finished.
    """
    from epub_conversion.utils import open_book, convert_epub_to_lines, convert_lines_to_text
    lines = convert_epub_to_lines(open_book(File))
    # The context manager closes the output even when a conversion or a
    # write fails part-way through the book.
    with open(Output_File, "a", encoding="utf-8") as f:
        for a in lines:
            f.write(''.join(convert_lines_to_text(a)))
    # NOTE(review): td is defined elsewhere -- presumably a temporary
    # directory that held the input; confirm.
    td.cleanup()
コード例 #5
0
    def readepub(self, fpath):
        """Collect the converted text pieces of every line of an EPUB.

        Each line is turned into a string and passed to
        ``ec.utils.convert_lines_to_text`` together with ``"txt"``; everything
        that call yields is gathered in order.

        Args:
            fpath: Path of the EPUB file to open.

        Returns:
            A flat list of the yielded pieces.
        """
        pieces = []
        for raw in ec.utils.convert_epub_to_lines(open_book(fpath)):
            pieces.extend(ec.utils.convert_lines_to_text(str(raw), "txt"))
        return pieces
コード例 #6
0
def book_read(book_name):
    """Load an EPUB, dump its joined lines to disk and return cleaned text.

    The number of raw lines is printed and the newline-joined lines are
    written to ``<book_name>.html`` (UTF-8). The same joined string is passed
    to ``get_text``; runs of spaces and runs of newlines in the result are
    then each squeezed to a single character.

    Args:
        book_name: Path of the EPUB; converted with ``str`` before use.
    """
    source = str(book_name)
    raw_lines = convert_epub_to_lines(open_book(source))
    print(len(raw_lines))
    markup = "\n".join(raw_lines)
    with open(source + ".html", "w", encoding="utf-8") as dump:
        dump.write(markup)
    squeezed = re.sub("[ ]+", " ", get_text(markup))
    return re.sub("[\n]+", "\n", squeezed)
コード例 #7
0
def textfinder(name):
    """Fetch the book stored under ``name`` and return its markup-free text.

    The object ``name`` is read from ``ficsuggest`` through ``s3``, its bytes
    are wrapped in an in-memory stream so that ``open_book`` can read them,
    and the resulting lines are joined with spaces and parsed with the lxml
    parser of ``BeautifulSoup``.

    Args:
        name: Name of the stored file to convert.

    Returns:
        The text of the book without HTML markup.
    """
    payload = s3.Object("ficsuggest", name).get()['Body'].read()
    book = open_book(BytesIO(payload))
    markup = " ".join(CETL(book))
    return BeautifulSoup(markup, features="lxml").get_text()
コード例 #8
0
def convert(target_path):
    """Write the text of every EPUB under ``friendimorphs_path`` to a gzip file.

    The books are listed with ``get_files_from_path`` before the archive is
    opened. Each sentence yielded for a book is UTF-8 encoded and written to
    the archive; a message is printed per book saying whether it was written
    or could not be opened.

    Args:
        target_path: Path of the gzip file to create.
    """
    found = get_files_from_path(".epub", friendimorphs_path)

    with gzip.open(target_path, "wb") as archive:
        for epub_path, epub_name in found:
            book = open_book(epub_path)
            if book is None:
                print("Couldn't open \"%s\"." % (epub_name))
                continue
            for sentence in convert_lines_to_text(convert_epub_to_lines(book)):
                archive.write(sentence.encode("utf-8"))
            print("Wrote \"%s\" to disk" % (epub_name))
コード例 #9
0
ファイル: autocards.py プロジェクト: Psionica/Autocards
 def consume_epub(self, filepath, title="untitled epub file"):
     """Take an EPUB file as input and create question/answer pairs from it.

     The lines of the book are joined with spaces, tags and HTML entities
     are removed with regular expressions, paragraph breaks are normalised,
     and the sanitised text is handed to ``self.consume_var`` with
     ``per_paragraph=True``.
     """
     raw = " ".join(convert_epub_to_lines(open_book(filepath)))
     # Tags go first; the two entities with a plain equivalent are mapped
     # before every remaining entity is blanked out.
     raw = re.sub("<.*?>", "", raw)
     raw = raw.replace("&nbsp;", " ").replace("&dash;", "-")
     raw = re.sub("&.*?;", " ", raw)
     # Paragraph limits as expected by self.consume_var: each carriage
     # return becomes a blank line and longer newline runs are shortened.
     raw = re.sub("\n\n\n*", "\n\n", raw.replace("\r", "\n\n"))
     self.consume_var(self._sanitize_text(raw), title, per_paragraph=True)
コード例 #10
0
def epub2txt(path, extension):
    """Convert an EPUB to a .txt file and return the new .txt path.

    The book is read with ``epub_conversion`` and markup is stripped with
    ``xml_cleaner.to_raw_text``. Kept text is appended to the output file,
    which is only opened when there is something to write.

    Args:
        path: Path of the EPUB file.
        extension: Substring of ``path`` that is replaced by ``".txt"`` to
            build the output path.

    Returns:
        The output path.
    """
    from epub_conversion.utils import open_book, convert_epub_to_lines
    from xml_cleaner import to_raw_text
    outputPath = path.replace(extension, ".txt")

    kept = []
    for line in convert_epub_to_lines(open_book(path)):
        # we strip out markup
        cleaned = to_raw_text(line, keep_whitespace=True)
        if not cleaned:
            # Nothing came back for this line, so there is no first entry
            # to look at.
            continue
        first = cleaned[0]
        # we only keep longer lines to avoid titles and pagination
        if len(first) > 15 and first[0] != "<":
            kept.append("".join(first) + "\n")

    if kept:
        # One open for the whole book instead of one per kept line.
        with open(outputPath, "a") as f:
            f.writelines(kept)
    return outputPath
コード例 #11
0
ファイル: ankiarticle.py プロジェクト: tjthejuggler/ankimaker
def get_text(text_filename):
    """Return the text of ``sources/<text_filename>`` in whichever format exists.

    ``.txt``, ``.epub`` and ``.pdf`` are tried in that order:

    * ``.txt``: the file content (UTF-8) with newlines replaced by spaces;
    * ``.epub``: the lines of the book joined with spaces;
    * ``.pdf``: the ``'content'`` entry returned by ``parser.from_file``,
      which is also printed.

    Args:
        text_filename: File name without extension.

    Returns:
        The text, or an empty string when none of the files exists.
    """
    base = 'sources/' + text_filename

    if path.exists(base + ".txt"):
        with open(base + '.txt', encoding="utf8") as file:
            return file.read().replace('\n', ' ')

    if path.exists(base + ".epub"):
        return ' '.join(convert_epub_to_lines(open_book(base + ".epub")))

    if path.exists(base + ".pdf"):
        raw = parser.from_file(base + ".pdf")
        print(raw['content'])
        return raw['content']

    return ''
コード例 #12
0
ファイル: epub_to_txt.py プロジェクト: hicsail/corpus
def _parse_book(epub_file):
    """
    Convert an epub file to a list of text strings.

    Lines that are empty or consist of a single newline are skipped; every
    other line is passed through ``strip_tags``.
    """
    raw_lines = convert_epub_to_lines(open_book(epub_file))
    return [strip_tags(raw) for raw in raw_lines if raw not in ('', '\n')]
コード例 #13
0
def get_source_text():
    """Return the text of the source named by ``USERDATA_.text_filename``.

    ``sources/<name>.txt``, ``.epub``, ``.pdf`` and ``.srt`` are tried in
    that order:

    * ``.txt``: the file content (UTF-8) with newlines replaced by spaces;
    * ``.epub``: the lines of the book joined with spaces;
    * ``.pdf``: the ``'content'`` entry returned by ``parser.from_file``;
    * ``.srt``: the result of ``convert_srt_to_text``.

    Returns:
        The text, or an empty string when none of the files exists.
    """
    base = 'sources/' + USERDATA_.text_filename
    # Default for the "nothing found" case; without it the final return
    # raised UnboundLocalError.
    source_text = ''
    if path.exists(base + ".txt"):
        with open(base + '.txt', encoding="utf8") as file:
            source_text = file.read().replace('\n', ' ')
    elif path.exists(base + ".epub"):
        book = open_book(base + ".epub")
        source_text = ' '.join(convert_epub_to_lines(book))
    elif path.exists(base + ".pdf"):
        raw = parser.from_file(base + ".pdf")
        source_text = raw['content']
    elif path.exists(base + ".srt"):
        source_text = convert_srt_to_text(base + ".srt")
    return source_text
コード例 #14
0
ファイル: main.py プロジェクト: RomichL/bigdata19.case03
def train():
    """Build the training text file if needed and derive sequences from it.

    When ``TEXTFILE`` does not exist, the ``BeautifulSoup`` text of every
    line of every ``*.epub`` in ``BUILDDIR`` is accumulated and written to
    it. A banner is printed before each book, and the first 150 characters
    of the text accumulated so far are printed after each book.

    The sequences are then built with ``seq_maxlen=5``. Nothing after that
    step is visible in this excerpt.
    """
    if not TEXTFILE.exists():
        text = ''
        for epub_path in BUILDDIR.glob('*.epub'):
            print(f'======= {epub_path} =======')
            markup_lines = convert_epub_to_lines(open_book(epub_path))
            text += ''.join(
                BeautifulSoup(markup).get_text() for markup in markup_lines)
            print(text[:150])
        TEXTFILE.write_text(text)

    x, y, charidx = textfile_to_semi_redundant_sequences(TEXTFILE,
                                                         seq_maxlen=5)
コード例 #15
0
    def open_file(self):
        """Let the user pick an ebook, cache its text and build the word list.

        Steps, all relative to ``sys.path[0]``:

        1. The chosen file is copied into ``ebooks/`` unless
           ``ebooks/<stem>.epub`` already exists.
        2. Unless ``ebook_text/<stem>.txt`` already exists, it is written
           from the book: each line is reduced to its lower-cased text and
           the separators matched by the pattern below are replaced by
           spaces.
        3. The text file is read back and its sorted, de-duplicated
           whitespace-separated words are stored in ``self.word_list``.

        A ``FileNotFoundError`` is shown in a warning box and any other
        exception in an error box; nothing is raised to the caller.
        """
        try:
            file = askopenfilename(parent=self.master)

            ebooks_dir = f"{sys.path[0]}/ebooks"
            os.makedirs(ebooks_dir, exist_ok=True)
            stem = Path(file).stem
            if not os.path.isfile(f"{ebooks_dir}/{stem}.epub"):
                copy(file, ebooks_dir)

            text_dir = f"{sys.path[0]}/ebook_text"
            os.makedirs(text_dir, exist_ok=True)
            text_path = f"{text_dir}/{stem}.txt"

            if not os.path.isfile(text_path):
                print("Reading epub to text..")
                lines = convert_epub_to_lines(open_book(file))
                # Raw string: the pattern itself is unchanged, but it no
                # longer relies on Python passing unknown string escapes
                # such as "\." through (a SyntaxWarning on recent versions).
                # Compiled once rather than re-parsed for every line.
                separators = re.compile(
                    r', |_|\.|\?|;|,|:|-|!|\+|\.\.\.|–|”|…|\(|\)')
                with open(text_path, 'w', encoding='utf-8') as f:
                    for line in lines:
                        plain = BeautifulSoup(line,
                                              'html.parser').text.lower()
                        f.write(" ".join(separators.split(plain + "\r\n")))
                print("Done..")
            else:
                print("Text file found.")

            with open(text_path, 'r', encoding='utf-8') as p:
                print("Generating word list..")
                self.word_list = sorted(set(p.read().split()))
                print("Done..")
        except FileNotFoundError as file_not_found:
            tk.messagebox.showwarning(title="Warning", message=file_not_found)
        except Exception as error:
            # Outermost handler of a GUI callback: report instead of crashing.
            tk.messagebox.showerror(title="Unknown Error", message=error)
コード例 #16
0
ファイル: BookCleaning.py プロジェクト: Nirmolk/LeapMotion
def linesFromBook(bookTitle):
    """Return the cleaned lines of an EPUB, each wrapped in carriage returns.

    Every line of the book is passed through ``mylinetoclean`` (which,
    according to the original notes, removes the tags), gets a carriage
    return prepended and appended, is printed, and is collected.

    Args:
        bookTitle: Path of the EPUB file to open.

    Returns:
        The list of wrapped lines, in book order.
    """
    cleaned_lines = []
    for raw in convert_epub_to_lines(open_book(bookTitle)):
        wrapped = "\r" + mylinetoclean(raw) + "\r"
        print(wrapped)
        cleaned_lines.append(wrapped)
    return cleaned_lines
コード例 #17
0
    def process_file(file_path):
        """Return the text content of a ``.txt`` or ``.epub`` file.

        The type is decided by whether ``'.txt'`` or ``'.epub'`` occurs
        anywhere in ``file_path``, checked in that order.

        * ``.txt``: the file content as read with the default encoding.
        * ``.epub``: the text nodes of every ``<p>`` element, one paragraph
          per line, after the closing ``body``/``html`` tags of the joined
          lines have been replaced by spaces.

        Returns:
            The text, or ``None`` when neither marker occurs in the path.
        """
        if '.txt' in file_path:
            with open(file_path) as handle:
                return handle.read()

        if '.epub' not in file_path:
            return None

        markup = ' '.join(convert_epub_to_lines(open_book(file_path)))
        for closing_tag in ('</body>', '</html>'):
            markup = markup.replace(closing_tag, ' ')
        soup = BeautifulSoup(markup, 'lxml')
        return ''.join(
            ''.join(node.findAll(text=True)) + '\n'
            for node in soup.findAll('p'))
コード例 #18
0
ファイル: eBookConverter.py プロジェクト: Loreton/eBooks
def ePubConverter_lineByline(base_path):
    """Open and convert every file of every folder listed for ``base_path``.

    The folders come from ``Ln.TreeList(base_path, 'eBook')``. For each of
    them, every regular file directly inside it is opened with ``open_book``
    and converted with ``convert_epub_to_lines`` and
    ``convert_lines_to_text``.

    Args:
        base_path: Directory that the listed folder paths are relative to.
    """
    from epub_conversion.utils import open_book, convert_epub_to_lines, convert_lines_to_text

    for relative_folder_path in Ln.TreeList(base_path, 'eBook'):
        full_folder_path = os.path.join(
            base_path, relative_folder_path).rstrip(os.path.sep)

        files = [
            f for f in os.listdir(full_folder_path)
            if os.path.isfile(os.path.join(full_folder_path, f))
        ]

        # Handled per folder: with this loop after the folder loop, only the
        # last folder was processed and an empty folder list raised
        # NameError on ``files``.
        for file in files:
            filename = os.path.join(full_folder_path, file)
            book = open_book(filename)
            lines = convert_epub_to_lines(book)
            # NOTE(review): the result is not used in the visible code --
            # confirm what is meant to happen with it.
            ltext = convert_lines_to_text(lines)
コード例 #19
0
ファイル: convertor.py プロジェクト: frederictaieb/meiso
	def __init__(self, filename):
		"""Open the EPUB for ``filename`` and keep the book and its lines.

		NOTE(review): the argument is stored under the private name
		``__filename`` but read back through ``self.filename``, which is not
		defined here -- presumably a property on the class; confirm.
		"""
		self.__filename = filename
		epub_path = self.filename + ".epub"
		self.book = open_book(epub_path)
		self.lines = convert_epub_to_lines(self.book)
コード例 #20
0
ファイル: tools.py プロジェクト: frederictaieb/meiso
 def __init__(self, filename):
     """Open the EPUB at ``filename`` and keep the book and its lines.

     The file is first opened and closed again with the builtin ``open``,
     so a path that cannot be opened raises before ``open_book`` is called.
     """
     with open(filename):
         pass
     self.book = open_book(filename)
     self.lines = convert_epub_to_lines(self.book)
コード例 #21
0
def downloadBook():
    """Return the text of the requested Project Gutenberg books as JSON.

    Query parameters (read from ``request.args``):
        id: Comma-separated integer book numbers (required).
        strip: Optional. Unless it is given and differs from ``"true"``
            (case-insensitive), every returned text is passed through
            ``gutenberg_cleaner.simple_cleaner``.

    For each number, a book for which ``bookCheck`` is true is loaded from
    ``app/books/<num>.json``. Any other book is downloaded as plain text,
    falling back to the EPUB edition on a 404, and stored in that file
    uncleaned. ``LRU(num)`` is called for every book either way.

    Returns:
        ``jsonify`` of a ``{"<num>": text}`` mapping. When a download fails,
        the caught ``requests`` exception object is returned instead
        (unchanged behaviour, see the note in the handlers).
    """
    app_log.info(log_sep)
    dict_json = {}
    num_list = [int(part) for part in request.args['id'].split(",")]
    os.makedirs("app/books/", exist_ok=True)
    # check for strip parameter (optional)
    if 'strip' in request.args:
        strip = request.args['strip'].lower()
    else:
        strip = "true"

    for num in num_list:
        url = f"https://www.gutenberg.org/cache/epub/{num}/pg{num}.txt"
        filename = f"app/books/{num}.json"

        if bookCheck(num):
            LRU(num)
            with open(filename) as json_file:
                dict_json.update(json.load(json_file))
            continue

        try:
            response = requests.get(url)
            if response.status_code == 404:
                # if the txt file doesn't exist, check for an epub
                response = requests.get(
                    f"https://www.gutenberg.org/cache/epub/{num}/pg{num}.epub"
                )
                response.raise_for_status()
                app_log.info("Converting epub...")
                print("Converting epub...")
                # NOTE(review): fixed temp path -- concurrent requests that
                # both take this branch would share the same file.
                os.makedirs("app/tmp/", exist_ok=True)
                with open("app/tmp/temp.epub", "wb") as f:
                    f.write(response.content)
                book = open_book("app/tmp/temp.epub")
                lines = convert_epub_to_lines(book)
                data = '\n'.join(lines)
                book.close()
                cleanr = re.compile('<.*?>')  # removes html markup
                data = re.sub(cleanr, '', data)
                shutil.rmtree("app/tmp")
            else:
                response.raise_for_status()
                data = response.content.decode()

        except requests.exceptions.HTTPError as a:
            app_log.info(a)
            # NOTE(review): this hands an exception object back as the
            # view's result; presumably an error response is wanted here --
            # confirm with the web framework's rules for return values.
            return a
        except requests.exceptions.RequestException as e:
            app_log.info(e)
            return e

        with open(filename, 'w') as outfile:
            dict_json[str(num)] = data
            json.dump({str(num): data}, outfile)
            LRU(num)

    # Clean once, after all books are collected. Inside the loop, every
    # earlier book was cleaned again for each later book.
    if strip == "true":
        for book_text in dict_json:
            dict_json[book_text] = gutenberg_cleaner.simple_cleaner(
                dict_json[book_text])

    return jsonify(dict_json)
コード例 #22
0
from epub_conversion.utils import get_files_from_path, convert_epub_to_lines, convert_lines_to_text, open_book

# Convert one EPUB and append its UTF-8 encoded text to book.txt.
book = open_book(
    "Alan Miller, Satoshi Kanazawa Why Beautiful People Have More Daughters From Dating, Shopping, and Praying to Going to War and Becoming a Billionaire.epub"
)
if book is not None:
    # The context manager closes book.txt even when the conversion or a
    # write fails part-way through.
    with open('book.txt', 'ab') as file:
        # NOTE(review): str() hands the string form of the whole line
        # collection to the converter, and the book object is passed as the
        # second argument -- confirm against convert_lines_to_text's
        # signature.
        for sentence in convert_lines_to_text(str(convert_epub_to_lines(book)),
                                              book):
            file.write(sentence.encode("utf-8"))
    print("Wrote \"%s\" to disk" % (book))
else:
    print("Couldn't open \"%s\"." % (book))
コード例 #23
0
ファイル: epub_to_txt.py プロジェクト: sergeiissaev/NLP_code
# Script: walk a directory of EPUB files and collect their tag-free lines.
# NOTE(review): the visible excerpt stops at an ``if`` without a body, so
# what is done with the split lines and with ``output``/``final`` is unknown.
###### MODIFY CODE HERE ########
# ``path`` is concatenated directly with each file name below, so it has to
# end with a path separator.
path = "path/to/epubs/directory/"
output = "name_of_output_file.txt"
###### END OF MODIFY CODE HERE ###########

from epub_conversion.utils import open_book, convert_epub_to_lines
import os
import re
from string import digits
# Non-greedy match of anything between "<" and ">"; used to delete tags.
cleanr = re.compile('<.*?>')
# Translation table that deletes every character of string.digits.
remove_digits = str.maketrans('', '', digits)
# Not written to in the visible part of the script.
final = []
for filename in os.listdir(path):
    # Kept lines of the current book.
    ret = []
    book = open_book(path + filename)
    lines = convert_epub_to_lines(book)
    # ``count`` advances once per line, so it always equals ``i`` below.
    count = 0
    length = len(lines)
    # Upper bound of the kept index window: 300 lines before the end.
    maxim = length - 300
    for i in range(len(lines)):

        cleantext = re.sub(cleanr, '', lines[i])
        # The last character is dropped (presumably a line terminator --
        # confirm), then trailing whitespace and all digits are removed.
        sos = cleantext[:-1].rstrip()
        sos = sos.translate(remove_digits)
        # Keep a line only when something other than digits and trailing
        # whitespace is left and its index lies strictly between 100 and
        # ``maxim``. The kept value still contains its digits.
        if sos.rstrip() != "" and count > 100 and count < maxim:
            ret.append(cleantext[:-1].lstrip())
        count += 1
    for i in range(len(ret)):
        line = ret[i]
        # Split the kept line on runs of non-word characters.
        sos = re.split(r'\W+', line)
        # More than two pieces; the body of this branch is not visible here.
        if len(sos) > 2:
0
ファイル: import_tfl.py プロジェクト: cameronfabbri/MealPrep
def main():
    """Import the recipes of ``data/books/TheFoodLab.epub`` into ``recipes.db``.

    The raw lines of the book are first dumped to ``f.txt``. The lines are
    then scanned in order:

    * a line containing ``recipe_rt`` starts a recipe; its parsed text
      becomes the title and an image URL is looked up for it (``None`` when
      the lookup fails);
    * within a recipe, a line containing ``recipe_i`` adds an ingredient and
      a line containing ``recipe_rsteps`` adds an instruction;
    * once ingredients and instructions have both been seen and the previous
      line contained ``recipe_rsteps``, the recipe is added to the session.

    A ``Week`` row is also added (and committed) for the first three
    recipes. Everything else is committed once at the end.
    """
    engine = create_engine('sqlite:///recipes.db')
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()

    book = open_book('data/books/TheFoodLab.epub')
    lines = convert_epub_to_lines(book)
    with open('f.txt', 'w') as f:
        for raw_line in lines:
            f.write(raw_line + '\n')

    parser = MyHTMLParser()

    in_recipe = False
    got_ing = False
    got_inst = False

    week_count = 0

    for lc, line in enumerate(tqdm(lines)):

        if 'recipe_rt' in line:
            in_recipe = True
            parser.feed(line)
            recipe_title = parser.data.title()
            ingredients = []
            instructions = []

            # Best effort: a failed image lookup must not stop the import,
            # but Ctrl-C and interpreter exit are no longer swallowed.
            try:
                url = BingImages(recipe_title, count=1).get()[0]
            except Exception:
                url = None

        if in_recipe:

            if 'recipe_i' in line:
                parser.feed(line)
                ingredient = parser.data.title()
                ingredients.append(ingredient)
                got_ing = True

            elif 'recipe_rsteps' in line:
                parser.feed(line)
                instruction = parser.data.title()
                instructions.append(instruction)
                got_inst = True

        if lc > 0:
            prev = lines[lc - 1]

        # ``prev`` is only evaluated once both flags are set, which cannot
        # happen on the first line (each line sets at most one of them).
        if got_inst and got_ing and 'recipe_rsteps' in prev:
            got_inst = False
            got_ing = False
            in_recipe = False

            recipe = Recipes(title=recipe_title,
                             ingredients=str(ingredients),
                             instructions=str(instructions),
                             url=url)

            session.add(recipe)

            if week_count < 3:
                # NOTE(review): recipe.id is read right after add(), before
                # any flush or commit -- presumably still unset at this
                # point; confirm.
                week_recipe = Week(id=recipe.id, slot_num=week_count + 1)
                session.add(week_recipe)
                session.commit()
                week_count += 1

    session.commit()
コード例 #25
0
from epub_conversion.utils import open_book, convert_epub_to_lines
import glob
from tqdm import tqdm

# Dump the raw lines of every EPUB in ../raw_files/epub_by_work/ to
# ../working_files/<name>_markup.txt, one line of the book per output line.
filenames = glob.glob('../raw_files/epub_by_work/*.epub')

for filename in tqdm(filenames):
    lines = convert_epub_to_lines(open_book(filename))
    # Last "/"-separated component without its final five characters
    # (the ".epub" suffix matched by the glob pattern).
    stem = filename.split('/')[-1][:-5]
    with open(f'../working_files/{stem}_markup.txt', 'w') as f:
        f.writelines(f'{line}\n' for line in lines)
コード例 #26
0
ファイル: epubToTxt.py プロジェクト: Dombearx/TP_project
from epub_conversion.utils import open_book, convert_epub_to_lines, convert_lines_to_text

# NOT WORKING!!
book = open_book("data/nad-niemnem.epub")

lines = convert_epub_to_lines(book)

print(len(lines))

print(lines[6])

# Only lines 7 to 24 (inclusive) are converted.
converted = [convert_lines_to_text(line) for line in lines[7:25]]

# The context manager closes the output even when a conversion or a write
# fails part-way through.
with open("./data_conv/nad-niemnem.txt", "w") as f:
    for x, conv in enumerate(converted):
        # Progress: index of the converted line being written.
        print(x)
        for line in conv:
            f.write(line)
コード例 #27
0
ファイル: read.py プロジェクト: felix-martel/stendhal
def _read_epub(path: Union[str, os.PathLike]) -> str:
    """Return the lines of the EPUB at ``path`` joined with newlines.

    The lines are used exactly as ``convert_epub_to_lines`` returns them;
    no further processing is applied here.
    """
    return "\n".join(convert_epub_to_lines(open_book(path)))
コード例 #28
0
def convert(name):
    """Open the EPUB ``name`` and return what ``convert_epub_to_lines`` gives.

    Args:
        name: Path of the EPUB file, passed unchanged to ``open_book``.
    """
    return convert_epub_to_lines(open_book(name))