Example #1
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer


class LemmatizerLatin:
    def __init__(self, token=True):
        self.lemmatizer = BackoffLatinLemmatizer()
        self.token = token

    def preprocess(self, text):
        if self.token:
            # Input is already a list of tokens
            lemma = self.lemmatizer.lemmatize(text)
        else:
            # Tokenize the raw string first
            plv = PunktLanguageVars()
            unigrams = plv.word_tokenize(text)
            lemma = self.lemmatizer.lemmatize(unigrams)

        # Keep the original token for punctuation, otherwise keep the lemma
        lemma = [t[0] if t[1] == "punc" else t[1] for t in lemma]

        return " ".join(lemma)
Example #2
 def test_backoff_latin_lemmatizer(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #3
File: test_lemmatize.py Project: cltk/cltk
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
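As the target above shows, verbose=True makes lemmatize() return (token, lemma, source) triples instead of (token, lemma) pairs. A small sketch, not part of the original tests, reusing the names from the test above to drop the source tag when only the pairs are needed:

# Illustrative: reduce verbose 3-tuples to plain (token, lemma) pairs
verbose_lemmas = lemmatizer.lemmatize(tokens)
plain_pairs = [(token, lemma) for token, lemma, source in verbose_lemmas]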
Example #4
def latin_lemma_text(list_of_texts, stopwords=None):
    '''
    Create a list of continuous lemma texts for Latin with CLTK (prerequisite).

    list_of_texts: raw text items stored in a list object
    stopwords: list of stopwords to be removed; default is None, in which case nothing is removed

    The Latin lemmatizer is CLTK's BackoffLatinLemmatizer; install, import and load it before calling this function.
    '''

    # Import packages and models from cltk and initialize tools
    from cltk.corpus.utils.importer import CorpusImporter
    from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
    corpus_importer = CorpusImporter(
        'latin')  # Initialize cltk's CorpusImporter
    corpus_importer.import_corpus(
        'latin_models_cltk')  # Import the latin_models_cltk corpus
    lemmatizer = BackoffLatinLemmatizer()  # Initialize Latin lemmatizer

    import re
    punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]"  # Punctuation pattern
    a = []
    for i in range(len(list_of_texts)):
        text = str(list_of_texts[i])
        new_text = ''.join([
            "" if ord(ch) < 32 or ord(ch) > 126 else ch for ch in text
        ])  # Remove non-printable and non-ASCII (e.g. Greek) characters
        text_no_punct = re.sub(punctuation, '', new_text)  # Remove punctuation
        text_one_white_space = re.sub(
            r"\s{2,}", ' ',
            text_no_punct)  # Leave only one white space b/w words
        text_no_trailing_space = text_one_white_space.strip(
        )  # Remove trailing white space
        text_lower = text_no_trailing_space.lower(
        )  # Transform to all lower case
        text_split = text_lower.split(' ')  # Split to a list of tokens
        lemmas = lemmatizer.lemmatize(text_split)  # Lemmatize
        textunit = ''  # Empty string to collect the lemmatized text unit
        for y in range(len(lemmas)):
            if stopwords is not None:
                if lemmas[y][1] not in stopwords:
                    textunit = textunit + str(lemmas[y][1] + ' ')
            else:
                textunit = textunit + str(lemmas[y][1] + ' ')
        textunit = textunit.strip()
        a.append(textunit)  # Add the "document" to a list
    return a
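A hypothetical call of latin_lemma_text, not from the original source; the input strings and the stopword list are made up for illustration.

# Hypothetical inputs; any list of raw Latin strings works
raw_texts = ["Ceterum antequam destinata componam.",
             "Gallia est omnis divisa in partes tres."]
lemma_docs = latin_lemma_text(raw_texts, stopwords=["sum", "in"])
print(lemma_docs)  # one continuous lemma string per input text, stopword lemmas removed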
Example #5
import os
import re
import string

# NOTE: L_OPEN, L_CLOSE, LB_OPEN, S_DATA_PATH, C_SIZE, args, lemmatizer,
# lemmas_result and tokens_result are defined elsewhere in the original script.
def tokenize(line):
    line = re.sub('<[^<]+>', '', line)  # Strip markup tags
    line = line.translate(str.maketrans('', '', string.punctuation))
    line = line.replace('\n', '')
    line = line.replace('-', '')
    line = line.replace('—', '')
    line = line.replace('“', '')
    line = line.replace('”', '')
    line = line.lower()
    return line.split()

with open(args.document, 'r', encoding='utf8') as f:
    for line in f:
        if (L_OPEN in line and L_CLOSE in line) or LB_OPEN in line:
            tokens = tokenize(line)
            if tokens != []:
                lemmas = lemmatizer.lemmatize(tokens)
                lemmas = list(zip(*lemmas))[1]
                lemma_str = "<BOS>" + " " + ' '.join(lemmas) + " " + "<EOS>" + " "
                line_str = "<BOS>" + " " +  line + " " + "<EOS>" + " "

                lemmas_result.append(lemma_str)
                tokens_result.append(line_str)

doc_start = len(os.listdir(S_DATA_PATH + 'lemmas/'))
print(doc_start)
for i in range(0, len(lemmas_result), C_SIZE):
    doc_name = str(doc_start + i // C_SIZE) + ".txt"
    with open(S_DATA_PATH + "lemmas/section" + doc_name, 'w') as out_file:
        out_file.writelines(lemmas_result[i:i + C_SIZE])
    with open(S_DATA_PATH + "tokens/section" + doc_name, 'w') as out_file:
        out_file.writelines(tokens_result[i:i + C_SIZE])
Example #6
import string

import numpy as np
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

lemmatizer = BackoffLatinLemmatizer()
ltr_str = ''

file = open('corpora/all.txt', 'r')
for line in file:
    ltr_str += str(line)
file.close()

np_str = np.asarray(ltr_str)

for symbol in string.punctuation:
    np_str = np.char.replace(np_str, symbol, '')

np_str = np.char.lower(np_str)
tokens = np_str.tolist().split()
lemmatized = lemmatizer.lemmatize(tokens)
with open('corpora/all_lemmatized.txt', 'w') as lemmata:
    for parsed in lemmatized:
        lemmata.write(parsed[1] + ' ')
#print('all lemmata written successfully :)')

# Make a list of Metamorphoses filenames
# ('filenames' is defined elsewhere in the original script)
met_filenames = []
for filename in filenames:
    if filename[5:8] == 'met':
        met_filenames.append(str(filename))

# Add their text to corpora/met.txt
with open('corpora/met.txt', 'w') as outfile:
    for fname in met_filenames:
        with open('ovid/' + fname, 'r') as infile:
Example #7
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

crimefile = open("passio.txt", 'r')
lemmatized = open("lemmatized.txt", 'w')

tokens = []
for line in crimefile:
    tokens.extend(line.split())  # accumulate tokens from every line, not just the last one
lemmatizer = BackoffLatinLemmatizer()
out = lemmatizer.lemmatize(tokens)

for word in out:
    lemmatized.write(word[1] + "\n")  # one lemma per output line

crimefile.close()
lemmatized.close()
Example #8
# Pre-process, tokenize, lemmatize text of thematic sections
docs = []
for i in range(len(s_df.Section_id.unique())):
    text = str(list(s_df.TextUnit[
        s_df.Section_id == i]))  # Load all text units from a thematic unit
    new_text = ''.join(
        ["" if ord(ch) < 32 or ord(ch) > 126 else ch for ch in text])
    # Remove non-printable and non-ASCII (e.g. Greek) characters
    text_no_punct = re.sub(punctuation, '', new_text)  # Remove punctuation
    text_one_white_space = re.sub(
        r"\s{2,}", ' ', text_no_punct)  # Leave only one white space b/w words
    text_no_trailing_space = text_one_white_space.strip(
    )  # Remove trailing white space
    text_lower = text_no_trailing_space.lower()  # Transform to all lower case
    text_split = text_lower.split(' ')  # Split to a list of tokens
    lemmas = lemmatizer.lemmatize(text_split)  # Lemmatize
    lemma_list = []  # Create empty list for lemmas
    for y in range(len(lemmas)):
        if lemmas[y][1] not in D_stoplist:
            lemma_list.append(lemmas[y][1])  # Keep the lemma if it is not a stopword
    l_string = ' '.join([str(word)
                         for word in lemma_list])  # Join the stopword-free lemmas into a string
    docs.append(l_string)  # Add the "document" to a list
sID['doc'] = docs  # Insert stopword-free lemma list into a new column

# Pre-process, tokenize and lemmatize section titles
section_titles = []
for x in sID.Section_id:
    text = sID.loc[sID.Section_id == x,
                   'Section_title'].values[0]  # Load upper-case section titles
    new_text = ''.join(