Code Example #1
import re
import pickle

def create_tokeniser_test_training():
    """Splits the glosses into two lists, one for training a character-level LSTM based tokeniser, and another
       containing 41 pre-selected glosses as a test set.
       The test set is split into two further lists, the first list is the untokenised glosses of the test set, the
       second is the same glosses, manually tokenised.
       Saves each list as a pickle file."""
    glist = list_numbered_glosses("Wurzburg Glosses", 499, 712)
    # List numbers of all chosen test-set glosses in order
    testglossids = [
        "2c4.", "5b11.", "5b28.", "6c7.", "6c9.", "9a14.", "9b4.", "9c20.",
        "10b27.", "10c21.", "10d23.", "10d36.", "11a24.", "12a22.", "12c9.",
        "12c29.", "12c32.", "12c36.", "14a8.", "14c2a.", "14c18.", "14c23.",
        "14d17.", "14d26.", "15a18.", "16d8.", "17d27.", "18a14.", "18c6.",
        "19b6.", "21a8.", "21c19.", "23b7.", "23d10.", "26b6.", "27a24.",
        "28c2.", "28d16.", "29d19.", "30b4.", "31c7."
    ]
    # Reads the manually tokenised test-set glosses from file and stores them in a dictionary,
    # keyed by gloss ID so they can be looked up using the list above
    man_tok_glosslist = get_text("Manually Tokenised Glosses").split("\n")
    mtgidpat = re.compile(r'\(\d{1,2}\w \d{1,2}\w?\) ')
    mtgs_with_ids = {}
    for mtg in man_tok_glosslist:
        mtgpatitir = mtgidpat.finditer(mtg)
        for mtgiditir in mtgpatitir:
            # Take the gloss ID directly from the match, remove it from the gloss text,
            # then normalise the ID to match the gloss-list keys (e.g. "(2c 4) " -> "2c4.")
            mtgid = mtgiditir.group()
            mtgloss = "".join(mtg.split(mtgid))
            mtgid_fix = "".join(mtgid.split(" ")) + "."
            mtgid_fix = "".join(mtgid_fix.split("("))
            mtgid_fix = "".join(mtgid_fix.split(")"))
            mtgs_with_ids[mtgid_fix] = mtgloss
    # Creates the test and training lists
    testglosses = []
    testglosses_tokenised = []
    trainglosses = []
    for g in glist:
        if g[0] in testglossids:
            testglosses.append(g[1])
            testglosses_tokenised.append(mtgs_with_ids.get(g[0]))
        else:
            trainglosses.append(g[1])
    # Combines the untokenised and tokenised test-set
    testglosses_set = [testglosses, testglosses_tokenised]
    # Saves the test and train sets to pickle files
    with open("toktest.pkl", "wb") as pickletest_out:
        pickle.dump(testglosses_set, pickletest_out)
    with open("toktrain.pkl", "wb") as pickletrain_out:
        pickle.dump(trainglosses, pickletrain_out)
    return "\nTest and Training Sets Compiled for Gloss Tokenisation.\n"
Code Example #2
import re

def opchsave(filename):
    """Opens a document, allows you to edit it somehow, saves a copy of the document."""
    # Open Document and get text
    text = get_text(filename)
    # Change Document somehow
    # Here instances of "...[a][/GLat]" are being changed to "...[/GLat][a]"
    glatpat = re.compile(r'\[\w\]\[/GLat\]')
    glatpatitir = glatpat.finditer(text)
    swaplist = []
    for i in glatpatitir:
        if i.group() not in swaplist:
            swaplist.append(i.group())
    for error in swaplist:
        letter = error[:3]  # the bracketed letter tag, e.g. "[a]"
        fix = "[/GLat]" + letter
        textlist = text.split(error)
        text = fix.join(textlist)
    # Save a copy of the updated Document
    save_docx(text, filename)
    return "Completed!"
Code Example #3
def get_pages(filename, startpage=499, endpage=712):
    """Opens the text of the Glosses from the document and returns the text of the selected page range"""
    alltext = get_text(filename)
    pagestext = []
    lastpage = 0
    nextpagepoint = 0
    startpoint = 0
    # Clamp the requested range to the valid page span (499-712);
    # a bound that falls entirely outside the span is reset to its default
    if startpage < 499:
        startpage = 499
    if endpage > 712:
        endpage = 712
    if startpage > 712:
        startpage = 499
    if endpage < 499:
        endpage = 712
    if startpage == 499 and endpage == 712:
        return alltext
    elif startpage > 499:
        alltext = alltext[find_page(alltext, startpage):]
    for page in range(startpage, endpage + 1):
        if lastpage == 0:
            pageno = startpage
        else:
            pageno = lastpage + 1
            startpoint += nextpagepoint
        if pageno == 712:
            # The final page runs to the end of the document
            pagetext = alltext[startpoint:]
        else:
            # Cut each page at the last line break before the first occurrence of the next page number
            nextpage = pageno + 1
            pagetext = alltext[startpoint:]
            nextpagepoint = pagetext.find(str(nextpage))
            pagetext = pagetext[:nextpagepoint]
            nextpagepoint = pagetext.rfind("\n")
            pagetext = pagetext[:nextpagepoint]
        pagestext.append(pagetext)
        lastpage = pageno
    pagestext = "".join(pagestext)
    return pagestext
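
A short usage sketch; the document name is assumed from Code Example #1, and bounds outside 499-712 fall back to the defaults as shown in the clamping step above.

pages = get_pages("Wurzburg Glosses", 500, 502)  # text of pages 500-502 only
print(pages[:200])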
Code Example #4
import pickle
from os import listdir
import os.path as op

# # Import the training set of glosses for use as the single training text
# one_text_in = open("toktrain.pkl", "rb")
# one_text = " ".join(pickle.load(one_text_in))
# text_name = "Wb. Training Glosses"

# Import and clean CELT texts for use as the single training text
clean_text_list = []
all_clean_files = [
    f for f in listdir("CELT_Texts_Clean")
    if op.isfile(op.join("CELT_Texts_Clean", f))
]
for cf in all_clean_files:
    if cf == ".DS_Store":
        continue  # skip macOS metadata files
    cf = "".join(cf.split(".docx"))
    cf = op.join("CELT_Texts_Clean", cf)
    clean_text_list.append(get_text(cf))
one_text = " ".join(clean_text_list)
one_text = " ".join(one_text.split("\n"))
while "  " in one_text:
    one_text = " ".join(one_text.split("  "))
text_name = "CELT Collection"

# Import test and train sets for character mapping
with open("toktrain.pkl", "rb") as train_in:
    train_set = pickle.load(train_in)
with open("toktest.pkl", "rb") as test_in:
    test_set = pickle.load(test_in)
x_train = remove_non_glosses(train_set)
# temp = []  # Reverse x_train for reverse models
# for x_trainer in x_train[::-1]:  # Reverse x_train for reverse models
#     new_trainer = x_trainer[::-1]  # Reverse x_train for reverse models
#     temp.append(new_trainer)  # Reverse x_train for reverse models
# x_train = temp  # Reverse x_train for reverse models
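
If the commented-out block above is enabled, the same reversal can be written more compactly; a sketch, assuming x_train is the list of gloss strings produced by remove_non_glosses:

# Reverse both the order of the training glosses and each gloss's characters,
# producing input for a right-to-left (reverse) model
x_train_reversed = [gloss[::-1] for gloss in x_train[::-1]]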
Code Example #5
import re
import os.path as op

def cleantext_CELT(filename):
    """Opens docx file of text as copied from CELT,
       removes line numbers and punctuation,
       saves the updated document."""
    # Open Document and get text
    text = get_text(filename)
    linelist = text.split("\n")
    # Here each line is stripped of leading and trailing whitespace
    text = "\n".join(line.strip() for line in linelist)
    # Here irregular intrusions into texts are removed
    removes = [
        "[LU1]", "[LU2]", "\nf. L.", 'L. f.', '.r.', '.C.', '.u.', ' m.',
        " c.", " e.", " R.", " U.", ".ix.", ".x.", ".xx.", ".xxx.", ".u."
    ]
    for rem in removes:
        text = "".join(text.split(rem))
    # Here punctuated items are replaced so that they are not changed when punctuation is removed
    rep_list = [".i.", ".l.", "rl."]
    for replacer in rep_list:
        rep_str = "***".join(replacer.split("."))
        text = rep_str.join(text.split(replacer))
    # Here hyphenated items are replaced appropriately
    post_hyph = ["h-", "m-", "n-", "l-", "t-", "s-", "c-", "r-"]
    for hyph_item in post_hyph:
        hyphpat = re.compile(r'[ ‘\n]' + hyph_item)
        hyphpatitir = hyphpat.finditer(text)
        for hyphpatitem in hyphpatitir:
            thishyphitem = hyphpatitem.group()
            hyph_gone = "".join(thishyphitem.split("-"))
            text = hyph_gone.join(text.split(thishyphitem))
    text = " ".join(text.split("­"))
    text = " ".join(text.split("-"))
    # Here the letter v is replaced with u wherever used
    text = "u".join(text.split("v"))
    text = "u".join(text.split("V"))
    # Here apostrophes are removed where they represent a split word
    apostlist = ["'s", "'S", "m' ", "d' ", "th' ", "t' ", "T' "]
    for apost in apostlist:
        apost_gone = "".join(apost.split("'"))
        text = apost_gone.join(text.split(apost))
    # Here line numbers are removed
    remnumlist = []
    linopat = re.compile(r'\n\d{1,4}\] ?')
    linopatitir = linopat.finditer(text)
    for i in linopatitir:
        num = i.group()
        remnumlist.append(num)
    for i in remnumlist[::-1]:
        text = "\n".join(text.split(i))
    # Here page numbers are removed
    pnumlist = []
    pnopat = re.compile(r'p\.\d{1,3}')
    pnopatitir = pnopat.finditer(text)
    for i in pnopatitir:
        pnum = i.group()
        pnumlist.append(pnum)
    for i in pnumlist[::-1]:
        text = "".join(text.split(i))
    # Here folio information is removed
    follist = []
    folpat = re.compile(r'-?{.+?}')  # non-greedy, so multiple brace groups on one line match separately
    folpatitir = folpat.finditer(text)
    for j in folpatitir:
        fol = j.group()
        follist.append(fol)
    for j in follist:
        text = "".join(text.split(j))
    # Here & and 'et' are replaced with ⁊
    text = "⁊".join(text.split("&"))
    text = " ⁊ ".join(text.split(" et "))
    # Here punctuation is removed
    punclist = [
        '!', ',', '.', ':', ';', '?', '"', "'", '‘', '’', '[', ']', '(', ')',
        '|', '/', '—', '_'
    ]
    for punc in punclist:
        text = "".join(text.split(punc))
    # Here punctuated items are reinserted into the text where they were replaced
    reinst_list = ["***i***", "***l***", "rl***"]
    for reinstater in reinst_list:
        reinst_str = ".".join(reinstater.split("***"))
        text = reinst_str.join(text.split(reinstater))
    # Here double spacing and triple line spacing are removed
    while "\n\n\n" in text:
        text = "\n\n".join(text.split("\n\n\n"))
    while "  " in text:
        text = " ".join(text.split("  "))
    while "\n " in text:
        text = "\n".join(text.split("\n "))
    # Here text is stripped and lower-cased
    text = text.lower()
    text = text.strip()
    # Save a copy of the updated Document
    save_docx(text, op.join(clean_dir,
                            filename[len(raw_dir) + 1:] + "_cleaned"))
    return "Complete!"
Code Example #6
                # Save Model
                model.save(NAME)
                print("Model {} saved".format(NAME))


"""Parameters Input:"""


# # Choose and name text to train on

# text_name = "Wb. Training Glosses"
# text_designation = "Wb"
# one_text = [" ".join(pickle.load(open("toktrain.pkl", "rb")))]
text_name = "Táin Bó Fraích"
text_designation = "TBF"
one_text = [rem_dubspace(" ".join((get_text("TBF_cleaned")).split("\n")))]


# # Map all test and training characters

mappings = map_chars(load_data(one_text, text_name))
chardict, rchardict, vocab_size = mappings  # forward map, reverse map, vocabulary size


# # Save the mapping

# pickle.dump(chardict, open('char_mappingTBF.pkl', 'wb'))  # Name mapping


# # Set how many characters the model should look at before predicting an upcoming character
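
The snippet breaks off before the window size is set, so the value below is an illustrative assumption rather than the project's setting. A sketch of the sequence-building step such a parameter typically feeds, using the chardict mapping created above:

buffer_len = 10  # window size assumed for illustration only
text = one_text[0]
# Each sequence holds buffer_len context characters plus the character to predict
sequences = [text[i - buffer_len:i + 1] for i in range(buffer_len, len(text))]
# Map characters to integers with the forward dictionary before training
encoded = [[chardict[ch] for ch in seq] for seq in sequences]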
Code Example #7
def openhandlists(file):
    """Gets the text from a gloss-hand file"""
    filetext = get_text(file)
    return filetext
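
A one-line usage sketch; the file name is illustrative only.

hand_text = openhandlists("Gloss Hands")  # file name assumed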