Example #1
def main():
    input = open('./Gratian1.txt', 'r').read()
    input = re.sub('['+string.punctuation+']', '', input)
    input = input.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(input)
    dictionary_1r = {}
    for lemma in lemmata:
        if lemma in dictionary_1r:
            dictionary_1r[lemma] += 1
        else:
            dictionary_1r[lemma] = 1
    # lemmata = dictionary_1r.keys()
    # for lemma in lemmata:
    #     print("%2d\t%s" % (dictionary_1r[lemma], lemma))
    input = open('./Gratian2.txt', 'r').read()
    input = re.sub('['+string.punctuation+']', '', input)
    input = input.lower()
    lemmata = lemmatizer.lemmatize(input)
    dictionary_2r = {}
    for lemma in lemmata:
        if lemma in dictionary_2r:
            dictionary_2r[lemma] += 1
        else:
            dictionary_2r[lemma] = 1
    lemmata = dictionary_2r.keys()
    for lemma in lemmata:
        if lemma not in dictionary_1r:
            print("%2d\t%s" % (dictionary_2r[lemma], lemma))
Example #2
def lemmanade(lines):

    count = 0
    lemons = []

    # initialize cltk tools
    #jvReplace = JVReplacer()
    wordTokenizer = WordTokenizer('latin')
    lemmatizer = LemmaReplacer('latin')

    for verse in lines:

        count = count + 1

        # lowercase
        #verse = jvReplace.replace(verse.lower())

        #tokenize the words
        chunkTok = wordTokenizer.tokenize(verse.lower())
        chunkTok = [
            whiteTok(tok) for tok in chunkTok if whiteTok(tok) is not None
        ]

        #lemmatize the tokens
        lemmata = lemmatizer.lemmatize(chunkTok)

        #add all the lemmatized tokens together in a string
        lemons.append(lemmata)

    return lemons
Example #3
def declineEachWordInList(word_list, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0
    total_list_length = len(word_list)

    words_string = ' '.join(word_list)
    normalized_string = normalizeLatinWordsInNonstandardGlyphs(words_string)
    jv_replaced_string = jv_replace(normalized_string)
    word_list = jv_replaced_string.split()
    lemmatizer = LemmaReplacer('latin')
    try:
        word_list = lemmatizer.lemmatize(word_list)
    except Exception:
        print("Lemmatization error with " + jv_replaced_string)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + jv_replaced_string)
        error_count += 1

    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1

    print('[' + str(error_count) + ' / ' + str(total_list_length) + '] declension errors')
    return declined_forms
Example #4
def declineWord(word, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0

    normalized_string = normalizeLatinWordsInNonstandardGlyphs(word)
    jv_replaced_string = jv_replace(normalized_string)
    lemmatizer = LemmaReplacer('latin')
    word_list = [jv_replaced_string]
    try:
        word_list = lemmatizer.lemmatize(jv_replaced_string)
    except Exception:
        print("Lemmatization error with " + word)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + word)
        error_count += 1

    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1

    return declined_forms
Example #5
 def test_lemmatizer_instr_outlemma_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
     self.assertEqual(lemmatized, target)
Example #6
 def test_lemmatizer_inlist_outlemma_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'hominum/homo divomque/divus voluptas/voluptas'
     self.assertEqual(lemmatized, target)
Example #7
 def test_lemmatizer_instr_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #8
 def test_lemmatizer_instr_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #9
 def test_lemmatizer_inlist_outlemma_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'hominum/homo divomque/divus voluptas/voluptas'
     self.assertEqual(lemmatized, target)
Example #10
 def test_lemmatizer_instr_outlemma_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
     self.assertEqual(lemmatized, target)
Example #11
 def test_lemmatizer_inlist_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['homo', 'divus', 'voluptas']
     self.assertEqual(lemmatized, target)
Example #12
 def test_lemmatizer_instr_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
     target = 'homo divus voluptas'
     self.assertEqual(lemmatized, target)
Example #13
 def test_lemmatizer_inlist_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['homo', 'divus', 'voluptas']
     self.assertEqual(lemmatized, target)
Example #14
 def test_lemmatizer_inlist_outlemma_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #15
 def test_lemmatizer_inlist_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_lemma=False, return_string=True)
     target = 'τὴν διάγνωσις ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #16
 def test_lemmatizer_inlist_outlemma_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example #17
 def test_lemmatizer_instr_outlemma_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #18
 def test_lemmatizer_instr_outlemma_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #19
def lemmatize():
    req_data = request.get_json()
    if req_data and req_data.get('input_text'):
        input_text = req_data['input_text']
        lemmatizer = LemmaReplacer('greek')
        return jsonify(lemmatizer.lemmatize(input_text))

    return jsonify({})
Example #20
 def test_lemmatizer_instr_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
     target = 'homo divus voluptas'
     self.assertEqual(lemmatized, target)
Example #21
def main():
    jv = JVReplacer()
    more = 0
    lemmatizer = LemmaReplacer('latin')
    word_counts = {}
    lines = open(sys.argv[1])
    for line in lines:
        words = line.split()
        for i in range(0, len(words)):
            words[i] = jv.replace(remove_punctuation(words[i]).lower())
        for word in words:
            #if word in stops_augmented:
            #    continue
            if "&" in word:
                continue
            lemmas = lemmatizer.lemmatize(word)
            if len(lemmas) == 0:
                more += 1
                continue

            stem = lemmas[0]
            if stem not in word_counts:
                word_counts[stem] = 1
            else:
                word_counts[stem] = word_counts[stem] + 1
    words_to_show = 400
    sorted_words = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word[0] for word in sorted_words][0:words_to_show]
    word_freqs = [word[1] for word in sorted_words][0:words_to_show]

    for i in range(0, words_to_show):
        print(str(i) + " " + top_words[i] + " " + str(word_freqs[i]))
    count = 0
    for i in range(0, words_to_show):
        count += word_freqs[i]
    print(str(count))
    print(str(more))


    s = np.arange(0.0, words_to_show, 1)
    t = word_freqs
    plt.plot(s, t)
    plt.ylabel('# appearances in Elegiae')
    plt.xlabel('rank of word frequency')
    plt.show()
Example #22
def main():
    corpus_importer = CorpusImporter('latin')
    corpora_list = corpus_importer.list_corpora
    print(corpora_list)
    corpus_importer.import_corpus('latin_models_cltk')
    sentence = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    sentence = sentence.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmatized_sentence = lemmatizer.lemmatize(sentence)
    print(lemmatized_sentence)
Example #23
 def test_lemmatizer_inlist_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized,
                                     return_lemma=False,
                                     return_string=True)
     target = 'τὴν διάγνωσις ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example #24
def lemmatizeList(words):
    tagger = POSTag('greek')

    lemmatizer = LemmaReplacer('greek')
    lemmWords = lemmatizer.lemmatize(words)

    # Remove stopwords and numbers, and lowercase all words.
    lemmWords = [w.lower() for w in lemmWords if w not in STOPS_LIST]
    lemmWords = removeNumbers(lemmWords)

    return lemmWords
Example #25
 def lemmatizeLat(self,
                  tokenized_words: list,
                  return_raw: bool = False) -> ([str]) or ([str], [str]):
     """
     Lemmatizes given list of words against the cltk perseus corpus. If second parameter
     is set to true -> returns a list with words BUT additionally with derived "source-word" after
     a "/" seperator.
     :param tokenized_words: String list of words to be lemmatized.
     :param return_raw: Boolean, decides if return should contain raw "source word" or not.
     :return: First index position -> List of lemmmatas; Second index position -_> if second parameter was true
     then list of lemmatas with "source_words" attached to each lemmata string BEFORE the lemmatized word.
     """
     lemmatizer = LemmaReplacer('latin')
     lemmata: [str] = lemmatizer.lemmatize(tokenized_words, False)
     if return_raw:
         lemmata_with_source: [str] = lemmatizer.lemmatize(
             tokenized_words, True)
         return lemmata, lemmata_with_source
     else:
         return lemmata
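To make the return_raw behaviour described in the docstring above concrete, here is a minimal sketch using the old-style LemmaReplacer API exactly as the test examples earlier on this page exercise it (the lemma values shown in the comments mirror those tests):

from cltk.stem.lemma import LemmaReplacer

lemmatizer = LemmaReplacer('latin')
tokens = ['hominum', 'divomque', 'voluptas']
lemmata = lemmatizer.lemmatize(tokens, return_raw=False)
# -> ['homo', 'divus', 'voluptas']
lemmata_with_source = lemmatizer.lemmatize(tokens, return_raw=True)
# -> ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']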
Example #26
def get_lemma(input_words, language):
    lang = None

    if language == "Latin":
        lemmatizer = LemmaReplacer("latin")

        # Required for CLTK module
        input_words = latin_lem_replacement(input_words)

    if language == "Greek":
        lemmatizer = LemmaReplacer("greek")

    if type(input_words) == list:
        results = lemmatizer.lemmatize(input_words)
        return results
    else:
        input_words = normalize_word(input_words)
        results = lemmatizer.lemmatize(input_words)
        if len(results) > 0:
            return results[0]
        else:
            return input_words
Example #27
def lemmatizeWord(word):
    """
    CLTK-based lemmatization function to lemmatize a single word.

    Since CLTK lemmatization always returns a list, it will only return the
    first element of that list. If you want the whole list or lemmatize more
    than one word, use lemmatizeAllWordsFromList.
    This function has no error checking in form of try-catch or anything.
    It's possible that lemmatization fails and thus, the returned string is empty.
    """
    lemmatizer = LemmaReplacer('latin')
    result = lemmatizer.lemmatize(word)
    # always returns list
    return result[0]
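Since the docstring notes that lemmatization can fail and that this function does no error checking, a guarded variant might look like the following sketch (the function name and default value are illustrative, not part of the original example):

from cltk.stem.lemma import LemmaReplacer

def lemmatizeWordSafe(word, default=''):
    """Illustrative variant of lemmatizeWord with a guard for an empty result list."""
    lemmatizer = LemmaReplacer('latin')
    result = lemmatizer.lemmatize(word)
    # lemmatize() always returns a list; fall back to a default if it is empty
    return result[0] if result else default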
Example #28
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    t = WordTokenizer(language)
    l = LemmaReplacer(language)

    text_word_tokens = t.tokenize(text)

    # Keep only tokens longer than three characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

    text_word_tokens = l.lemmatize(text_word_tokens)

    return text_word_tokens
Example #29
def preprocess(doc):
    assert (type(doc) == str)
    word_tokenizer = WordTokenizer('latin')
    doc_word_tokens = word_tokenizer.tokenize(doc)
    doc_word_tokens_no_punt = [
        token.lower() for token in doc_word_tokens
        if token not in ['.', ',', ':', ';']
    ]

    # lemmatization
    corpus_importer = CorpusImporter('latin')
    corpus_importer.import_corpus('latin_models_cltk')
    jv_replacer = JVReplacer()

    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(" ".join(doc_word_tokens_no_punt))
    cleaned = remove_latin_library_items(" ".join(lemmata))
    return cleaned
Example #30
    def lemmatizeList(self, lines):
        from cltk.corpus.utils.formatter import cltk_normalize

        tagger = POSTag('greek')

        lemmatizer = LemmaReplacer('greek')

        # normalization can help when using certain texts (the docs recommend it)
        lines = cltk_normalize(lines)

        # print(lines)
        # exit(0)
        lines = lemmatizer.lemmatize(lines)

        # Remove stopwords and numbers, and lowercase all words.
        lines = [w.lower() for w in lines if w not in STOPS_LIST]
        # lemmWords = removeNumbers(lemmWords)

        return ' '.join(lines)
Example #31
def runTest(text):
   '''Test cltk tools for latin'''
   print('Test phrase:')
   print(' -> ' + text)
   print()

#   print('[1/3] Testing JVReplacer')
#   jv = JVReplacer()
#   text = jv.replace(text)
#   print(' -> ' + text)
#   print()

   print('[2/3] Testing WordTokenizer')
   tokenizer = WordTokenizer('latin')
   tok = tokenizer.tokenize(text)
   print(' -> ' + ', '.join(["'{}'".format(t) for t in tok]))
   print()

   print('[3/3] Testing LemmaReplacer')
   lemmatizer = LemmaReplacer('latin')
   lem = lemmatizer.lemmatize(tok)
   print(' -> ' + ', '.join(["'{}'".format(l) for l in lem]))
   print()
Example #32
def get_docs(letters):

    docs = []
    count = 0
    for i, entry in enumerate(letters):
        letter, tag = entry
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()

        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        count += len(words)
        doc = TaggedDocument(words, [tag])
        docs.append(doc)
    return docs
Example #33
def pre_process(letters):

    pre_processed = []
    for letter in letters:
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()

        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        # very common words that seemed to be confounding the topic model
        words = [
            w for w in words if w not in ['magnus', 'bonus', 'ago', 'valeo']
        ]
        pre_processed.append(words)
    return pre_processed
Example #34
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [word for word in set(lemmatizer.lemmatize(text.lower())) if not word in STOPS_LIST]
Example #35
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence,
                                    rm_punctuation=True,
                                    rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [
                    w[1:] if w.startswith('-') else w for w in sentence
                ]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if sentence:
                sentence = sentence

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
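A hedged usage sketch for the generator above; it assumes the PHI5 corpus and the Latin models have already been imported into the local CLTK corpus directory, otherwise assemble_phi5_author_filepaths() has nothing to read:

# Print the first lemmatized sentence from the first PHI5 author file.
for sentence in gen_docs('phi5', lemmatize=True, rm_stops=True):
    print(sentence)  # a list of lowercased, lemmatized tokens
    break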
Example #36
    from cltk.stem.lemma import LemmaReplacer
    from cltk.tag.pos import POSTag
    lemmatizer = LemmaReplacer('greek')
    tagger = POSTag('greek')
else:
    import spacy
    nlp = spacy.load("en_core_web_trf")  # English

result = []
count = 0
for sentence in data:
    sentence = re.sub(
        r"[\.\?·;]\s*$", '', sentence
    )  # remove sentence-ending punctuation; all other punctuation has already been removed
    if language == 'grc':
        lemmas = lemmatizer.lemmatize(sentence)
        tagged = tagger.tag_tnt(sentence)
        tagged = [[w[1], w[0]] for w in tagged if not cltk_ignored(w[1], w[0])]
        a = []
        i = 0
        for w in lemmas:
            if i >= len(tagged):
                break
            pos = tagged[i][0]
            a.append([
                tagged[i][1], lemmas[i],
                cltk_pos_code_to_pos(pos), f"cltk:{pos}"
            ])  # original, lemma, part of speech, cltk part of speech
            i = i + 1
        if len(lemmas) != len(tagged):
            print(lemmas, "\n", tagged, "\n", len(lemmas), len(tagged), "\n",
Example #37
def main():
    corpus = tokenize_corpus(load_pliny_corpus())

    replacer = JVReplacer()
    corpus = replacer.replace(corpus)

    NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')

    sentences = [
        re.sub(NO_PUNCT_RE, '', sentence).lower()
        for sentence in corpus.split('\n')
    ]

    # lemmatize the words

    lemmatizer = LemmaReplacer('latin')
    words_by_sentence = [lemmatizer.lemmatize(sent) for sent in sentences]
    all_words = [item for sublist in words_by_sentence for item in sublist]
    word_map = {}
    unk_count = 0

    count = [['UNK', -1]]
    count.extend(collections.Counter(all_words).most_common(299))
    for word, _ in count:
        word_map[word] = len(word_map)
    for word in all_words:
        index = word_map.get(word, 0)
        if index == 0:
            unk_count += 1
    count[0][1] = unk_count
    reverse_word_map = dict(zip(word_map.values(), word_map.keys()))

    data = []
    WINDOW_SIZE = 5

    for sentence in words_by_sentence:
        for i, word in enumerate(sentence):
            for nb_word in \
                sentence[max(i - WINDOW_SIZE, 0):min(i + WINDOW_SIZE, (len(sentence) + 1))]:
                if nb_word != word:
                    data.append(
                        [word_map.get(word, 0),
                         word_map.get(nb_word, 0)])
    x_train = []
    y_train = []

    for word in data:
        x_train.append(to_one_hot(word[0], 300))
        y_train.append(to_one_hot(word[1], 300))

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    x = tf.placeholder(tf.float32, shape=(None, 300))
    y_label = tf.placeholder(tf.float32, shape=(None, 300))

    W1 = tf.Variable(tf.random_normal([300, 5]))
    b1 = tf.Variable(tf.random_normal([5]))

    hidden_representation = tf.add(tf.matmul(x, W1), b1)

    W2 = tf.Variable(tf.random_normal([5, 300]))
    b2 = tf.Variable(tf.random_normal([300]))
    prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2),
                                      b2))

    with tf.Session() as sess:
        init = tf.global_variables_initializer()

        sess.run(init)  #make sure you do this!
        # define the loss function:
        cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(
            y_label * tf.log(prediction), reduction_indices=[1]))
        # define the training step:
        train_step = tf.train.GradientDescentOptimizer(0.1).minimize(
            cross_entropy_loss)
        n_iters = 10000
        # train for n_iter iterations
        for _ in range(n_iters):

            start = np.random.randint(0, 700)
            end = start + 700
            if end > len(x_train) - 1:
                end = len(x_train) - 1
                start = len(x_train) - 701

            sess.run(train_step,
                     feed_dict={
                         x: x_train[start:end],
                         y_label: y_train[start:end]
                     })
            print(
                'loss is : ',
                sess.run(cross_entropy_loss,
                         feed_dict={
                             x: x_train[start:end],
                             y_label: y_train[start:end]
                         }))

        vectors = sess.run(W1 + b1)
        with open('word2int.pickle', 'wb') as fp:
            pickle.dump(word_map, fp)
        with open('int2word.pickle', 'wb') as fp:
            pickle.dump(reverse_word_map, fp)
        with open('vectors.pickle', 'wb') as fp:
            pickle.dump(vectors, fp)
Example #38
        #circumflex:
        sentences[i] = re.sub(r'ô', r'o', sentences[i])
        sentences[i] = re.sub(r'î', r'i', sentences[i])
        sentences[i] = re.sub(r'â', r'a', sentences[i])
        sentences[i] = re.sub(r'û', r'u', sentences[i])
        sentences[i] = re.sub(r'ê', r'e', sentences[i])
        sentences[i] = re.sub(r'ŷ', r'y', sentences[i])

        # grave accent:

        sentences[i] = re.sub(r'à', r'a', sentences[i])
        sentences[i] = re.sub(r'è', r'e', sentences[i])
        sentences[i] = re.sub(r'æ̀', r'ae', sentences[i])
        sentences[i] = re.sub(r'ì', r'i', sentences[i])
        sentences[i] = re.sub(r'ò', r'o', sentences[i])
        sentences[i] = re.sub(r'ù', r'u', sentences[i])
        sentences[i] = re.sub(r'ỳ', r'y', sentences[i])

        # ligatures:
        sentences[i] = re.sub(r'æ', r'ae', sentences[i])
        sentences[i] = re.sub(r'œ', r'oe', sentences[i])

        lemmatized_sentence = " ".join(lemmatizer.lemmatize(sentences[i]))
        lemmatized_sentence = re.sub(r'\.', r'. ', lemmatized_sentence)
        lemmatized_sentence = re.sub(r'\?', r'? ', lemmatized_sentence)
        lemmatized_sentence = re.sub(r'\!', r'! ', lemmatized_sentence)
        #print("lemmatized sentence:", lemmatized_sentence)

        # Write to output files:
        output.write(lemmatized_sentence)
Example #39
 def getLemma(self):
   lemmatizer = LemmaReplacer('latin')
   return lemmatizer.lemmatize(self.text)
Example #40
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

for w in wordElementList:
        form = w.attributes['ana'].value
        print(form)
        # Parse the inflected word
        try:
            lemmaList = lemmatizer.lemmatize(form.lower())
            lemma = lemmaList[0].replace('v', 'u')
            posList   = tagger.tag_tnt(j.replace(form.lower()))
            pos = posList[0][1]
            w.setAttribute('n', form)
            w.setAttribute('lemma', lemma)
            w.setAttribute('ana', pos)
        except:
            raise

"""
with open('output.xml', 'w') as f:
    f = codecs.lookup("utf-8")[3](f)
    xmldoc.writexml(f, encoding="utf-8")
"""
Example #41
def lemmatize(form):
    lemmatizer = LemmaReplacer('latin')
    list = lemmatizer.lemmatize(form)
    return json.dumps(list)
Example #42
    else:
        assert False, "unhandled option"

if not filename:
    sys.exit(0)
    
outfilename = filename + ".CLTK-wlt.txt"
lc = 0

with open(filename, 'r') as f:
    with open(outfilename, 'w') as of:
        for l in f:
            l = l.strip()
            bits = l.split()
            if len(bits) != 3:
                continue
            w = normalize('NFC', bits[0])
            l = normalize('NFC', bits[1])
            t = bits[2]
            lemma = cltk_lemmatiser.lemmatize( w )[0]
            #tag   = cltk_tagger.tag_ngram_123_backoff( w )[0]
            tag   = cltk_tagger.tag_tnt( w )[0]
            # tags are all caps
            # καὶ [('καὶ', 'C--------')]
            # δι’ [('δι', None), ('’', '---------')]
            if '#' in lemma:
                hidx = lemma.find('#')
                lemma = lemma[0:hidx]
            print( w, "\t", lemma, "\t", tag[1], file=of )
            lc += 1
Example #43
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if sentence:
                sentence = sentence

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
Example #44

lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

text = []
#text = ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']
with open('/home/ilbuonme/siti/paolo.monella/ursus/lemma/recycleBin/textForOrig-myCLTK.txt', 'r') as f:
    for x in f.readlines():
        for w in x.split(' '):
            text.append(w)

for t in text:
    if t:
        # Note: the tagger likes 'divisa', while the lemmatizer likes 'diuisa'
        lemmaList = lemmatizer.lemmatize(t.lower())
        posList   = tagger.tag_tnt(j.replace(t.lower()))
        form = posList[0][0]
        lemma = lemmaList[0].replace('v', 'u')
        pos = posList[0][1]
        print('<w n="' + form + '" lemma="'+lemma + '" ana="' + pos + '">')

"""
# Apparently j.replace... makes it worse for the POS tagger, so I'm dropping it in this case
text = j.replace(text.lower())
forms = tagger.tag_unigram(text.lower())
for fm in forms:
    print(fm)
"""
Example #45
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [
        word for word in set(lemmatizer.lemmatize(text.lower()))
        if not word in STOPS_LIST
    ]
Example #46
#!/bin/python3

from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

lemmatizer = LemmaReplacer('greek')

text = """
μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος
οὐλομένην, ἣ μυρί' ̓Αχαιοῖς ἄλγε' ἔθηκε,
πολλὰς δ' ἰφθίμους ψυχὰς ̓́Αϊδι προί̈αψεν
ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν
οἰωνοῖσί τε πᾶσι, Διὸς δ' ἐτελείετο βουλή,
ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε
Ατρεί̈δης τε ἄναξ ἀνδρῶν καὶ δῖος ̓Αχιλλεύς.
"""

#print(lemmatizer.lemmatize(text))
print(lemmatizer.lemmatize("Μῆνιν ἄειδε, θεά"))
print(
    lemmatizer.lemmatize(
        cltk_normalize("μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος")))
# ... doesn't work without the normalization