Example #1
 def test_backoff_latin_lemmatizer(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #2
File: test_lemmatize.py  Project: cltk/cltk
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #3
 def test_backoff_latin_lemmatizer(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #4
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #5
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer


class LemmatizerLatin:
    def __init__(self, token=True):
        self.lemmatizer = BackoffLatinLemmatizer()
        self.token = token

    def preprocess(self, text):
        if self.token:
            lemma = self.lemmatizer.lemmatize(text)
        else:
            plv = PunktLanguageVars()
            unigrams = plv.word_tokenize(text)
            lemma = self.lemmatizer.lemmatize(unigrams)

        lemma = [t[0] if t[1] == "punc" else t[1] for t in lemma]

        return " ".join(lemma)
Example #6
def latin_lemma_text(list_of_texts, stopwords=None):
    '''
    Create a list of continuous lemma texts for Latin with cltk (a prerequisite).

    list_of_texts: raw text items stored in a list object
    stopwords: list of stopwords to be removed; default is None, in which case nothing is removed

    The Latin lemmatizer is cltk's BackoffLatinLemmatizer. Install, import and load it before using the function.
    '''

    # Import packages and models from cltk and initialize tools
    from cltk.corpus.utils.importer import CorpusImporter
    from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
    corpus_importer = CorpusImporter('latin')  # Initialize cltk's CorpusImporter
    corpus_importer.import_corpus('latin_models_cltk')  # Import the latin_models_cltk corpus
    lemmatizer = BackoffLatinLemmatizer()  # Initialize the Latin lemmatizer

    import re
    punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]"  # Punctuation pattern
    a = []
    for i in range(len(list_of_texts)):
        text = str(list_of_texts[i])
        new_text = ''.join(["" if ord(c) < 32 or ord(c) > 126 else c for c in text])  # Keep printable ASCII only (drops Greek and other non-ASCII characters)
        text_no_punct = re.sub(punctuation, '', new_text)  # Remove punctuation
        text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct)  # Leave only one white space between words
        text_no_trailing_space = text_one_white_space.strip()  # Remove leading and trailing white space
        text_lower = text_no_trailing_space.lower()  # Transform to all lower case
        text_split = text_lower.split(' ')  # Split into a list of tokens
        lemmas = lemmatizer.lemmatize(text_split)  # Lemmatize
        textunit = ''  # Empty string to accumulate the lemmatized text unit
        for y in range(len(lemmas)):
            if stopwords is not None:
                if lemmas[y][1] not in stopwords:
                    textunit = textunit + str(lemmas[y][1] + ' ')
            else:
                textunit = textunit + str(lemmas[y][1] + ' ')
        textunit = textunit.strip()
        a.append(textunit)  # Add the "document" to a list
    return a
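
A short usage sketch for latin_lemma_text (not from the source); the input texts and stopword list are invented for illustration and assume the cltk models above are installed:

raw_texts = ["Gallia est omnis divisa in partes tres.", "Arma virumque cano."]
lemma_texts = latin_lemma_text(raw_texts, stopwords=["sum", "in"])
print(lemma_texts)  # One continuous lemma string per input text, with stopword lemmas dropped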
Example #7
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer


def stage4(tokens):
    # lemmatize() returns a list of (token, lemma) tuples
    results = BackoffLatinLemmatizer().lemmatize(tokens)
    lemmas = []  # unique lemmas
    for result in results:
        lemma = result[1]
        if lemma not in lemmas:
            lemmas.append(lemma)
    return lemmas
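
A brief illustration of the tuple output that stage4 deduplicates, assuming the latin_models_cltk corpus is installed (the tokens are hypothetical and the actual lemmas depend on the installed models):

tokens = ['arma', 'uirum', 'arma']
# BackoffLatinLemmatizer().lemmatize(tokens) yields (token, lemma) pairs such as
# [('arma', 'arma'), ('uirum', 'uir'), ('arma', 'arma')]
print(stage4(tokens))  # Unique lemmas in first-seen order, e.g. ['arma', 'uir']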
Example #8
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer


def process(text):
    tokens = WordTokenizer('latin').tokenize(text)
    # lemmatize() returns a list of (token, lemma) tuples
    results = BackoffLatinLemmatizer().lemmatize(tokens)
    lemmas = []  # unique lemmas
    for result in results:
        lemma = result[1]
        if lemma not in lemmas:
            lemmas.append(lemma)
    return lemmas
Example #9
def tokenize(request):
    language = request['Content-Language']
    src_data = request['Payload']
    print(language)

    word_tokenizer = WordTokenizer(language)
    data = word_tokenizer.tokenize(src_data)
    clean_data = list(map(cltk_normalize, [w for w in data if w.isalpha()]))
    # and not w in STOPS_LIST]

    # lemma = LemmaReplacer(language).lemmatize(clean_data)
    lemma = None
    if language == 'greek':
        lemma = BackoffGreekLemmatizer().lemmatize(clean_data)
    elif language == 'latin':
        lemma = BackoffLatinLemmatizer().lemmatize(clean_data)

    result = []
    for i, elem in enumerate(lemma):
        w, l = elem
        result.append({'index': i + 1, 'word': w, 'lemma': l})

    return result
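
A hypothetical request for the handler above; the dict keys mirror the header and payload fields the function reads, and the printed values are illustrative:

request = {'Content-Language': 'latin', 'Payload': 'Gallia est omnis divisa in partes tres'}
for row in tokenize(request):
    print(row)  # e.g. {'index': 1, 'word': 'Gallia', 'lemma': 'Gallia'}, ...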
Example #10
 def __init__(self, token=True):
     self.lemmatizer = BackoffLatinLemmatizer()
     self.token = token
Example #11
import argparse
import os, os.path
import re
import string

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

L_OPEN = '<l'
L_CLOSE = '</l>'
LB_OPEN = '<lb'

S_DATA_PATH = 'data/structured/'
C_SIZE = 150

parser = argparse.ArgumentParser(description='Parse unstructered documents.')
parser.add_argument('--document', required=True, help='Path to document to parse.')
args = parser.parse_args()

lemmatizer = BackoffLatinLemmatizer()
lemmas_result, tokens_result = [], []

def tokenize(line):
    line = re.sub('<note[^<]+note>', '', line)
    line = re.sub('<[^<]+>', '', line)
    line = line.translate(str.maketrans('', '', string.punctuation))
    line = line.replace('\n', '')
    line = line.replace('-', '')
    line = line.replace('—', '')
    line = line.replace('“', '')
    line = line.replace('”', '')
    line = line.lower()
    return line.split()

with open(args.document, 'r', encoding='utf8') as f:
Example #12
import polyglot
from polyglot.downloader import downloader
from polyglot.text import Text
import statistics
import math
import openpyxl
from openpyxl import Workbook
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.latin.j_v import JVReplacer

corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_models_cltk')
lemmatizer = BackoffLatinLemmatizer()
j = JVReplacer()


def lemmatize(text):
    text = j.replace(text)
    tokens = [token for token in text.split()]
    lemmatized = lemmatizer.lemmatize(tokens)
    lemmatized_text = " ".join([token[1] for token in lemmatized])
    return lemmatized_text


def motets_ordered_by_difference(motets):
    motets.sort(key=lambda x: x.sentiment_difference())
    book = Workbook()
    sheet = book.active
    sheet['A1'] = "Title"
    sheet['B1'] = "Composer"
Example #13
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
import os
import string
import numpy as np

#first, build list of filenames
filenames = []
for filename in os.listdir('ovid'):
    filenames.append(str(filename))

#then, concatenate them into one text file https://stackoverflow.com/questions/13613336/python-concatenate-text-files
with open('corpora/all.txt', 'w') as outfile:
    for fname in filenames:
        with open('ovid/' + fname, 'r') as infile:
            for line in infile:
                outfile.write(line)

lemmatizer = BackoffLatinLemmatizer()
ltr_str = ''

file = open('corpora/all.txt', 'r')
for line in file:
    ltr_str += str(line)
file.close()

np_str = np.asarray(ltr_str)

for symbol in string.punctuation:
    np_str = np.char.replace(np_str, symbol, '')

np_str = np.char.lower(np_str)
tokens = np_str.tolist().split()
lemmatized = lemmatizer.lemmatize(tokens)
Example #14
# Import basic packages
import pandas as pd
import re
import numpy as np

# Import packages and models from cltk and initialize tools
from cltk.corpus.utils.importer import CorpusImporter
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

corpus_importer = CorpusImporter('latin')  # Initialize cltk's CorpusImporter
corpus_importer.import_corpus(
    'latin_models_cltk')  # Import the latin_models_cltk corpus
lemmatizer = BackoffLatinLemmatizer()  # Initialize Latin lemmatizer
from cltk.stem.latin.j_v import JVReplacer

# Import and initialize TfidfVectorizer with a custom stoplist
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Load dataframes
file_path_df = '/home/mribary/Dropbox/pyDigest/dump/Ddf_v106.csv'
file_path_s = '/home/mribary/Dropbox/pyDigest/dump/Ddf_sections_v001.csv'
file_path_sID = '/home/mribary/Dropbox/pyDigest/dump/Ddf_Section_IDs_v001.csv'
file_path_stoplist = '/home/mribary/Dropbox/pyDigest/dump/D_stoplist_001.txt'
df = pd.read_csv(file_path_df, index_col=0)  # text units (21055)
s = pd.read_csv(file_path_s, index_col=0)  # text units with section IDs (21055)
sID = pd.read_csv(file_path_sID,
                  index_col=0)  # sections with section IDs (432)
D_stoplist = list(pd.read_csv(file_path_stoplist,
Example #15
 def test_backoff_latin_lemmatizer_evaluate(self):
     """Test backoffLatinLemmatizer evaluate method"""
     lemmatizer = BackoffLatinLemmatizer(verbose=False)
     accuracy = lemmatizer.evaluate()
     self.assertTrue(.85 <= accuracy <= 1)
Example #16
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'

latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)

# Set up CLTK tools

word_tokenizer = WordTokenizer('latin')
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Get tokens

tokens = clean_and_write.vocab_set('clean_texts/ov_met_6_clean.txt')
print(tokens)

# Get lemmas

lemmas = lemmatizer.lemmatize(tokens)
print(lemmas)

lemmata = []
for (x, y) in lemmas:
    lemmata.append(y)
print(sorted(set(lemmata)))
Example #17
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

crimefile = open("passio.txt", 'r')
lemmatized = open("lemmatized.txt", 'w')

# Accumulate tokens from every line of the input file
tokens = []
for line in crimefile:
    tokens += line.split()

lemmatizer = BackoffLatinLemmatizer()
out = lemmatizer.lemmatize(tokens)

# Write one lemma per line to the output file
for word in out:
    lemmatized.write(word[1] + "\n")

crimefile.close()
lemmatized.close()
Example #18
File: test_lemmatize.py  Project: cltk/cltk
 def test_backoff_latin_lemmatizer_evaluate(self):
     """Test backoffLatinLemmatizer evaluate method"""
     lemmatizer = BackoffLatinLemmatizer(verbose=False)
     accuracy = lemmatizer.evaluate()
     self.assertTrue(.85 <= accuracy <= 1)
Example #19
 def test_backoff_latin_lemmatizer_evaluate_verbose(self):
     """Test backoffLatinLemmatizer evaluate method"""
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     with self.assertRaises(AssertionError):
         accuracy = lemmatizer.evaluate()
Example #20
File: test_lemmatize.py  Project: cltk/cltk
 def test_backoff_latin_lemmatizer_evaluate_verbose(self):
     """Test backoffLatinLemmatizer evaluate method"""
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     with self.assertRaises(AssertionError):
         accuracy = lemmatizer.evaluate()
Example #21
# la_lemmatizer = LemmaReplacer('latin')


# Latin Lemmatizer (NEW with backoff)
# Set up training sentences
rel_path = os.path.join('/Users/christiancasey/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
la_lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Greek Lemmatizer
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')

# Initialize lemmatizers once outside of the loop,
# then select based on language inside the loop -- get_words_from_file()
tagLat = POSTag('latin')
tagGrk = POSTag('greek')


def lemmatize(word_list, copy):
	for word in word_list:
		if copy:
Example #22
 def test_backoff_latin_lemmatizer_models_not_present(self):
     """Test whether models are present for BackoffLatinLemmatizer"""
     with patch.object(BackoffLatinLemmatizer, 'models_path', ''):
         with self.assertRaises(FileNotFoundError):
             lemmatizer = BackoffLatinLemmatizer()