Example #1
def main():
    input = open('./Gratian1.txt', 'r').read()
    input = re.sub('['+string.punctuation+']', '', input)
    input = input.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(input)
    dictionary_1r = {}
    for lemma in lemmata:
        if lemma in dictionary_1r:
            dictionary_1r[lemma] += 1
        else:
            dictionary_1r[lemma] = 1
    # lemmata = dictionary_1r.keys()
    # for lemma in lemmata:
    #     print("%2d\t%s" % (dictionary_1r[lemma], lemma))
    input = open('./Gratian2.txt', 'r').read()
    input = re.sub('['+string.punctuation+']', '', input)
    input = input.lower()
    lemmata = lemmatizer.lemmatize(input)
    dictionary_2r = {}
    for lemma in lemmata:
        if lemma in dictionary_2r:
            dictionary_2r[lemma] += 1
        else:
            dictionary_2r[lemma] = 1
    lemmata = dictionary_2r.keys()
    for lemma in lemmata:
        if lemma not in dictionary_1r:
            print("%2d\t%s" % (dictionary_2r[lemma], lemma))
Example #2
def lemmanade(lines):

    count = 0
    lemons = []

    # initialize cltk tools
    #jvReplace = JVReplacer()
    wordTokenizer = WordTokenizer('latin')
    lemmatizer = LemmaReplacer('latin')

    for verse in lines:

        count = count + 1

        # lowercase
        #verse = jvReplace.replace(verse.lower())

        #tokenize the words
        chunkTok = wordTokenizer.tokenize(verse.lower())
        chunkTok = [
            whiteTok(tok) for tok in chunkTok if whiteTok(tok) is not None
        ]

        #lemmatize the tokens
        lemmata = lemmatizer.lemmatize(chunkTok)

        #add all the lemmatized tokens together in a string
        lemons.append(lemmata)

    return lemons
Example #3
def declineEachWordInList(word_list, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0
    total_list_length = len(word_list)

    words_string = ' '.join(word_list)
    normalized_string = normalizeLatinWordsInNonstandardGlyphs(words_string)
    jv_replaced_string = jv_replace(normalized_string)
    word_list = jv_replaced_string.split()
    lemmatizer = LemmaReplacer('latin')
    try:
        word_list = lemmatizer.lemmatize(word_list)
    except Exception:
        print("Lemmatization error with " + jv_replaced_string)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + jv_replaced_string)
        error_count += 1

    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1

    print('[' + str(error_count) + ' / ' + str(total_list_length) + '] declension errors')
    return declined_forms
Example #4
def declineWord(word, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0

    normalized_string = normalizeLatinWordsInNonstandardGlyphs(word)
    jv_replaced_string = jv_replace(normalized_string)
    lemmatizer = LemmaReplacer('latin')
    word_list = [jv_replaced_string]
    try:
        word_list = lemmatizer.lemmatize(jv_replaced_string)
    except Exception:
        print("Lemmatization error with " + word)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + word)
        error_count += 1

    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1

    return declined_forms
Example #5
 def test_lemmatizer_instr_outlemma_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
     self.assertEqual(lemmatized, target)
Example #6
 def test_lemmatizer_inlist_outlemma_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'hominum/homo divomque/divus voluptas/voluptas'
     self.assertEqual(lemmatized, target)
Example #7
 def test_lemmatizer_instr_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #8
 def test_lemmatizer_instr_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #9
 def test_lemmatizer_inlist_outlemma_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'hominum/homo divomque/divus voluptas/voluptas'
     self.assertEqual(lemmatized, target)
Example #10
 def test_lemmatizer_instr_outlemma_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
     self.assertEqual(lemmatized, target)
Example #11
 def test_lemmatizer_inlist_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['homo', 'divus', 'voluptas']
     self.assertEqual(lemmatized, target)
Example #12
 def test_lemmatizer_instr_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
     target = 'homo divus voluptas'
     self.assertEqual(lemmatized, target)
Example #13
 def test_lemmatizer_inlist_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['homo', 'divus', 'voluptas']
     self.assertEqual(lemmatized, target)
Example #14
 def test_lemmatizer_inlist_outlemma_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #15
 def test_lemmatizer_inlist_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_lemma=False, return_string=True)
     target = 'τὴν διάγνωσις ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #16
 def test_lemmatizer_inlist_outlemma_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #17
 def test_lemmatizer_instr_outlemma_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #18
 def test_lemmatizer_instr_outlemma_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #19
def lemmatize():
    req_data = request.get_json()
    if req_data and req_data.get('input_text'):
        input_text = req_data['input_text']
        lemmatizer = LemmaReplacer('greek')
        return jsonify(lemmatizer.lemmatize(input_text))

    return jsonify({})
Example #20
 def test_lemmatizer_instr_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
     target = 'homo divus voluptas'
     self.assertEqual(lemmatized, target)
Example #21
def main():
    jv = JVReplacer()
    more = 0
    lemmatizer = LemmaReplacer('latin')
    word_counts = {}
    lines = open(sys.argv[1])
    for line in lines:
        words = line.split()
        for i in range(0, len(words)):
            words[i] = jv.replace(remove_punctuation(words[i]).lower())
        for word in words:
            #if word in stops_augmented:
            #    continue
            if "&" in word:
                continue
            lemmas = lemmatizer.lemmatize(word)
            if len(lemmas) == 0:
                more += 1
                continue

            stem = lemmas[0]
            if stem not in word_counts:
                word_counts[stem] = 1
            else:
                word_counts[stem] = word_counts[stem] + 1
    words_to_show = 400
    sorted_words = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word[0] for word in sorted_words][0:words_to_show]
    word_freqs = [word[1] for word in sorted_words][0:words_to_show]

    for i in range(0, words_to_show):
        print(str(i) + " " + top_words[i] + " " + str(word_freqs[i]))
    count = 0
    for i in range(0, words_to_show):
        count += word_freqs[i]
    print(str(count))
    print(str(more))


    s = np.arange(0.0, words_to_show, 1)
    t = word_freqs
    plt.plot(s, t)
    plt.ylabel('# appearances in Elegiae')
    plt.xlabel('rank of word frequency')
    plt.show()
Example #22
def main():
    corpus_importer = CorpusImporter('latin')
    corpora_list = corpus_importer.list_corpora
    print(corpora_list)
    corpus_importer.import_corpus('latin_models_cltk')
    sentence = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    sentence = sentence.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmatized_sentence = lemmatizer.lemmatize(sentence)
    print(lemmatized_sentence)
Example #23
 def test_lemmatizer_inlist_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized,
                                     return_lemma=False,
                                     return_string=True)
     target = 'τὴν διάγνωσις ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #24
def lemmatizeList(words):
    tagger = POSTag('greek')

    lemmatizer = LemmaReplacer('greek')
    lemmWords = lemmatizer.lemmatize(words)

    # Remove stopwords and numbers, and lowercase all words.
    lemmWords = [w.lower() for w in lemmWords if w not in STOPS_LIST]
    lemmWords = removeNumbers(lemmWords)

    return lemmWords
Example #25
 def lemmatizeLat(self,
                  tokenized_words: list,
                  return_raw: bool = False) -> ([str]) or ([str], [str]):
     """
     Lemmatizes given list of words against the cltk perseus corpus. If second parameter
     is set to true -> returns a list with words BUT additionally with derived "source-word" after
     a "/" seperator.
     :param tokenized_words: String list of words to be lemmatized.
     :param return_raw: Boolean, decides if return should contain raw "source word" or not.
     :return: First index position -> List of lemmmatas; Second index position -_> if second parameter was true
     then list of lemmatas with "source_words" attached to each lemmata string BEFORE the lemmatized word.
     """
     lemmatizer = LemmaReplacer('latin')
     lemmata: [str] = lemmatizer.lemmatize(tokenized_words, False)
     if return_raw:
         lemmata_with_source: [str] = lemmatizer.lemmatize(
             tokenized_words, True)
         return lemmata, lemmata_with_source
     else:
         return lemmata
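To make the return_raw behaviour described in the docstring above concrete, here is a minimal sketch using the old-style LemmaReplacer API exactly as the test examples earlier on this page exercise it (the lemma values shown in the comments mirror those tests):

from cltk.stem.lemma import LemmaReplacer

lemmatizer = LemmaReplacer('latin')
tokens = ['hominum', 'divomque', 'voluptas']
lemmata = lemmatizer.lemmatize(tokens, return_raw=False)
# -> ['homo', 'divus', 'voluptas']
lemmata_with_source = lemmatizer.lemmatize(tokens, return_raw=True)
# -> ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']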
Example #26
def get_lemma(input_words, language):
    lang = None

    if language == "Latin":
        lemmatizer = LemmaReplacer("latin")

        # Required for CLTK module
        input_words = latin_lem_replacement(input_words)

    if language == "Greek":
        lemmatizer = LemmaReplacer("greek")

    if type(input_words) == list:
        results = lemmatizer.lemmatize(input_words)
        return results
    else:
        input_words = normalize_word(input_words)
        results = lemmatizer.lemmatize(input_words)
        if len(results) > 0:
            return results[0]
        else:
            return input_words
Example #27
def lemmatizeWord(word):
    """
    CLTK-based lemmatization function to lemmatize a single word.

    Since CLTK lemmatization always returns a list, it will only return the
    first element of that list. If you want the whole list or lemmatize more
    than one word, use lemmatizeAllWordsFromList.
    This function has no error checking in form of try-catch or anything.
    It's possible that lemmatization fails and thus, the returned string is empty.
    """
    lemmatizer = LemmaReplacer('latin')
    result = lemmatizer.lemmatize(word)
    # always returns list
    return result[0]
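Since the docstring notes that lemmatization can fail and that this function does no error checking, a guarded variant might look like the following sketch (the function name and default value are illustrative, not part of the original example):

from cltk.stem.lemma import LemmaReplacer

def lemmatizeWordSafe(word, default=''):
    """Illustrative variant of lemmatizeWord with a guard for an empty result list."""
    lemmatizer = LemmaReplacer('latin')
    result = lemmatizer.lemmatize(word)
    # lemmatize() always returns a list; fall back to a default if it is empty
    return result[0] if result else default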
Example #28
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    t = WordTokenizer(language)
    l = LemmaReplacer(language)

    text_word_tokens = t.tokenize(text)

    # Keep only tokens longer than three characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

    text_word_tokens = l.lemmatize(text_word_tokens)

    return text_word_tokens
Example #29
def preprocess(doc):
    assert (type(doc) == str)
    word_tokenizer = WordTokenizer('latin')
    doc_word_tokens = word_tokenizer.tokenize(doc)
    doc_word_tokens_no_punt = [
        token.lower() for token in doc_word_tokens
        if token not in ['.', ',', ':', ';']
    ]

    # lemmatization
    corpus_importer = CorpusImporter('latin')
    corpus_importer.import_corpus('latin_models_cltk')
    jv_replacer = JVReplacer()

    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(" ".join(doc_word_tokens_no_punt))
    cleaned = remove_latin_library_items(" ".join(lemmata))
    return cleaned
Example #30
    def lemmatizeList(self, lines):
        from cltk.corpus.utils.formatter import cltk_normalize

        tagger = POSTag('greek')

        lemmatizer = LemmaReplacer('greek')

        # normalization can help when using certain texts (the docs recommend it)
        lines = cltk_normalize(lines)

        # print(lines)
        # exit(0)
        lines = lemmatizer.lemmatize(lines)

        # Remove stopwords and numbers, and lowercase all words.
        lines = [w.lower() for w in lines if w not in STOPS_LIST]
        # lemmWords = removeNumbers(lemmWords)

        return ' '.join(lines)
Example #31
def runTest(text):
   '''Test cltk tools for latin'''
   print('Test phrase:')
   print(' -> ' + text)
   print()

#   print('[1/3] Testing JVReplacer')
#   jv = JVReplacer()
#   text = jv.replace(text)
#   print(' -> ' + text)
#   print()

   print('[2/3] Testing WordTokenizer')
   tokenizer = WordTokenizer('latin')
   tok = tokenizer.tokenize(text)
   print(' -> ' + ', '.join(["'{}'".format(t) for t in tok]))
   print()

   print('[3/3] Testing LemmaReplacer')
   lemmatizer = LemmaReplacer('latin')
   lem = lemmatizer.lemmatize(tok)
   print(' -> ' + ', '.join(["'{}'".format(l) for l in lem]))
   print()
Example #32
def get_docs(letters):

    docs = []
    count = 0
    for i, entry in enumerate(letters):
        letter, tag = entry
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()

        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        count += len(words)
        doc = TaggedDocument(words, [tag])
        docs.append(doc)
    return docs
Example #33
def pre_process(letters):

    pre_processed = []
    for letter in letters:
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()

        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        # very common words that seemed to be confounding the topic model
        words = [
            w for w in words if w not in ['magnus', 'bonus', 'ago', 'valeo']
        ]
        pre_processed.append(words)
    return pre_processed
Example #34
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [word for word in set(lemmatizer.lemmatize(text.lower())) if not word in STOPS_LIST]
Example #35
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence,
                                    rm_punctuation=True,
                                    rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [
                    w[1:] if w.startswith('-') else w for w in sentence
                ]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if sentence:
                sentence = sentence

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
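A hedged usage sketch for the generator above; it assumes the PHI5 corpus and the Latin models have already been imported into the local CLTK corpus directory, otherwise assemble_phi5_author_filepaths() has nothing to read:

# Print the first lemmatized sentence from the first PHI5 author file.
for sentence in gen_docs('phi5', lemmatize=True, rm_stops=True):
    print(sentence)  # a list of lowercased, lemmatized tokens
    break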
Example #36
    from cltk.stem.lemma import LemmaReplacer
    from cltk.tag.pos import POSTag
    lemmatizer = LemmaReplacer('greek')
    tagger = POSTag('greek')
else:
    import spacy
    nlp = spacy.load("en_core_web_trf")  # English

result = []
count = 0
for sentence in data:
    sentence = re.sub(
        r"[\.\?·;]\s*$", '', sentence
    )  # remove sentence-ending punctuation; all other punctuation has already been removed
    if language == 'grc':
        lemmas = lemmatizer.lemmatize(sentence)
        tagged = tagger.tag_tnt(sentence)
        tagged = [[w[1], w[0]] for w in tagged if not cltk_ignored(w[1], w[0])]
        a = []
        i = 0
        for w in lemmas:
            if i >= len(tagged):
                break
            pos = tagged[i][0]
            a.append([
                tagged[i][1], lemmas[i],
                cltk_pos_code_to_pos(pos), f"cltk:{pos}"
            ])  # original, lemma, part of speech, cltk part of speech
            i = i + 1
        if len(lemmas) != len(tagged):
            print(lemmas, "\n", tagged, "\n", len(lemmas), len(tagged), "\n",
Example #37
def main():
    corpus = tokenize_corpus(load_pliny_corpus())

    replacer = JVReplacer()
    corpus = replacer.replace(corpus)

    NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')

    sentences = [
        re.sub(NO_PUNCT_RE, '', sentence).lower()
        for sentence in corpus.split('\n')
    ]

    # lemmatize the words

    lemmatizer = LemmaReplacer('latin')
    words_by_sentence = [lemmatizer.lemmatize(sent) for sent in sentences]
    all_words = [item for sublist in words_by_sentence for item in sublist]
    word_map = {}
    unk_count = 0

    count = [['UNK', -1]]
    count.extend(collections.Counter(all_words).most_common(299))
    for word, _ in count:
        word_map[word] = len(word_map)
    for word in all_words:
        index = word_map.get(word, 0)
        if index == 0:
            unk_count += 1
    count[0][1] = unk_count
    reverse_word_map = dict(zip(word_map.values(), word_map.keys()))

    data = []
    WINDOW_SIZE = 5

    for sentence in words_by_sentence:
        for i, word in enumerate(sentence):
            for nb_word in \
                sentence[max(i - WINDOW_SIZE, 0):min(i + WINDOW_SIZE, (len(sentence) + 1))]:
                if nb_word != word:
                    data.append(
                        [word_map.get(word, 0),
                         word_map.get(nb_word, 0)])
    x_train = []
    y_train = []

    for word in data:
        x_train.append(to_one_hot(word[0], 300))
        y_train.append(to_one_hot(word[1], 300))

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    x = tf.placeholder(tf.float32, shape=(None, 300))
    y_label = tf.placeholder(tf.float32, shape=(None, 300))

    W1 = tf.Variable(tf.random_normal([300, 5]))
    b1 = tf.Variable(tf.random_normal([5]))

    hidden_representation = tf.add(tf.matmul(x, W1), b1)

    W2 = tf.Variable(tf.random_normal([5, 300]))
    b2 = tf.Variable(tf.random_normal([300]))
    prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2),
                                      b2))

    with tf.Session() as sess:
        init = tf.global_variables_initializer()

        sess.run(init)  #make sure you do this!
        # define the loss function:
        cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(
            y_label * tf.log(prediction), reduction_indices=[1]))
        # define the training step:
        train_step = tf.train.GradientDescentOptimizer(0.1).minimize(
            cross_entropy_loss)
        n_iters = 10000
        # train for n_iter iterations
        for _ in range(n_iters):

            start = np.random.randint(0, 700)
            end = start + 700
            if end > len(x_train) - 1:
                end = len(x_train) - 1
                start = len(x_train) - 701

            sess.run(train_step,
                     feed_dict={
                         x: x_train[start:end],
                         y_label: y_train[start:end]
                     })
            print(
                'loss is : ',
                sess.run(cross_entropy_loss,
                         feed_dict={
                             x: x_train[start:end],
                             y_label: y_train[start:end]
                         }))

        vectors = sess.run(W1 + b1)
        with open('word2int.pickle', 'wb') as fp:
            pickle.dump(word_map, fp)
        with open('int2word.pickle', 'wb') as fp:
            pickle.dump(reverse_word_map, fp)
        with open('vectors.pickle', 'wb') as fp:
            pickle.dump(vectors, fp)
Example #38
        #circumflex:
        sentences[i] = re.sub(r'ô', r'o', sentences[i])
        sentences[i] = re.sub(r'î', r'i', sentences[i])
        sentences[i] = re.sub(r'â', r'a', sentences[i])
        sentences[i] = re.sub(r'û', r'u', sentences[i])
        sentences[i] = re.sub(r'ê', r'e', sentences[i])
        sentences[i] = re.sub(r'ŷ', r'y', sentences[i])

        # grave accent:

        sentences[i] = re.sub(r'à', r'a', sentences[i])
        sentences[i] = re.sub(r'è', r'e', sentences[i])
        sentences[i] = re.sub(r'æ̀', r'ae', sentences[i])
        sentences[i] = re.sub(r'ì', r'i', sentences[i])
        sentences[i] = re.sub(r'ò', r'o', sentences[i])
        sentences[i] = re.sub(r'ù', r'u', sentences[i])
        sentences[i] = re.sub(r'ỳ', r'y', sentences[i])

        # ligatures:
        sentences[i] = re.sub(r'æ', r'ae', sentences[i])
        sentences[i] = re.sub(r'œ', r'oe', sentences[i])

        lemmatized_sentence = " ".join(lemmatizer.lemmatize(sentences[i]))
        lemmatized_sentence = re.sub(r'\.', r'. ', lemmatized_sentence)
        lemmatized_sentence = re.sub(r'\?', r'? ', lemmatized_sentence)
        lemmatized_sentence = re.sub(r'\!', r'! ', lemmatized_sentence)
        #print("lemmatized sentence:", lemmatized_sentence)

        # Write to output files:
        output.write(lemmatized_sentence)
Example #39
 def getLemma(self):
   lemmatizer = LemmaReplacer('latin')
   return lemmatizer.lemmatize(self.text)
Example #40
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

for w in wordElementList:
        form = w.attributes['ana'].value
        print(form)
        # Parse the inflected word
        try:
            lemmaList = lemmatizer.lemmatize(form.lower())
            lemma = lemmaList[0].replace('v', 'u')
            posList   = tagger.tag_tnt(j.replace(form.lower()))
            pos = posList[0][1]
            w.setAttribute('n', form)
            w.setAttribute('lemma', lemma)
            w.setAttribute('ana', pos)
        except:
            raise

"""
with open('output.xml', 'w') as f:
    f = codecs.lookup("utf-8")[3](f)
    xmldoc.writexml(f, encoding="utf-8")
"""
Example #41
def lemmatize(form):
    lemmatizer = LemmaReplacer('latin')
    list = lemmatizer.lemmatize(form)
    return json.dumps(list)
Example #42
    else:
        assert False, "unhandled option"

if not filename:
    sys.exit(0)
    
outfilename = filename + ".CLTK-wlt.txt"
lc = 0

with open(filename, 'r') as f:
    with open(outfilename, 'w') as of:
        for l in f:
            l = l.strip()
            bits = l.split()
            if len(bits) != 3:
                continue
            w = normalize('NFC', bits[0])
            l = normalize('NFC', bits[1])
            t = bits[2]
            lemma = cltk_lemmatiser.lemmatize( w )[0]
            #tag   = cltk_tagger.tag_ngram_123_backoff( w )[0]
            tag   = cltk_tagger.tag_tnt( w )[0]
            # tags are all caps
            # καὶ [('καὶ', 'C--------')]
            # δι’ [('δι', None), ('’', '---------')]
            if '#' in lemma:
                hidx = lemma.find('#')
                lemma = lemma[0:hidx]
            print( w, "\t", lemma, "\t", tag[1], file=of )
            lc += 1
Example #43
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if sentence:
                sentence = sentence

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
Example #44

lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

text = []
#text = ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']
with open('/home/ilbuonme/siti/paolo.monella/ursus/lemma/recycleBin/textForOrig-myCLTK.txt', 'r') as f:
    for x in f.readlines():
        for w in x.split(' '):
            text.append(w)

for t in text:
    if t:
        # Note: the tagger likes 'divisa', while the lemmatizer likes 'diuisa'
        lemmaList = lemmatizer.lemmatize(t.lower())
        posList   = tagger.tag_tnt(j.replace(t.lower()))
        form = posList[0][0]
        lemma = lemmaList[0].replace('v', 'u')
        pos = posList[0][1]
        print('<w n="' + form + '" lemma="'+lemma + '" ana="' + pos + '">')

"""
# Apparently j.replace... makes it worse for the POS tagger, so I'm dropping it in this case
text = j.replace(text.lower())
forms = tagger.tag_unigram(text.lower())
for fm in forms:
    print(fm)
"""
Example #45
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [
        word for word in set(lemmatizer.lemmatize(text.lower()))
        if not word in STOPS_LIST
    ]
Example #46
#!/bin/python3

from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

lemmatizer = LemmaReplacer('greek')

text = """
μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος
οὐλομένην, ἣ μυρί' ̓Αχαιοῖς ἄλγε' ἔθηκε,
πολλὰς δ' ἰφθίμους ψυχὰς ̓́Αϊδι προί̈αψεν
ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν
οἰωνοῖσί τε πᾶσι, Διὸς δ' ἐτελείετο βουλή,
ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε
Ατρεί̈δης τε ἄναξ ἀνδρῶν καὶ δῖος ̓Αχιλλεύς.
"""

#print(lemmatizer.lemmatize(text))
print(lemmatizer.lemmatize("Μῆνιν ἄειδε, θεά"))
print(
    lemmatizer.lemmatize(
        cltk_normalize("μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος")))
# ... doesn't work without the normalization