Example #1
def annotate_important_features(self, text, features):
    """Wrap every token whose surface form or lemma matches one of the
    given features in a colour annotation."""
    toks = text.split()
    features = self.trim_feature_prefixes(features)
    result = []
    for tok in toks:
        # Match candidates: the surface token plus every lemma the
        # morphological analyser proposes for it.
        variants = [tok] + [an['lemma'] for an in analyze([tok])[0]['analysis']]
        variants = [self._unifier.unify(v.lower()) for v in variants]
        match = self.does_match(variants, features)
        if match is not None:
            tok = self.annotate_color(tok, match)
        result.append(tok)
    return ' '.join(result)
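For reference, the legacy estnltk `analyze` output that this method indexes into has roughly the shape below. The keys are the ones used across the examples on this page; the sample values are illustrative, not verified analyser output.

analyze(['öötööde'])
# [{'text': 'öötööde',
#   'analysis': [{'lemma': 'öötöö', 'root': 'öö_töö', 'root_tokens': ['öö', 'töö'],
#                 'ending': 'de', 'form': 'pl g', 'partofspeech': 'S'}]}]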
Example #2
def morphy(word):
    """Performs morphological analysis on `word`.

    Parameters
    ----------
    word : str
      Word to be lemmatized.

    Returns
    -------
    str or None
      Lemma of the `word`, or None if the analyser returns no analysis.

    """
    analyzed = analyze([word])
    # Take the first analysis candidate of the last (here: only) token.
    return analyzed[-1]['analysis'][0]['lemma'] if analyzed else None
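A minimal usage sketch, assuming the legacy `from estnltk import analyze` import used throughout these examples; the sample word and the lemma in the comment are illustrative.

from estnltk import analyze  # required by morphy above

print(morphy('öötööde'))  # expected to print a lemma such as 'öötöö'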
Example #4
import string

from estnltk import analyze  # legacy estnltk 1.x API, as elsewhere on this page


def return_word_stats(keyword_in, sent):
    """Return the morphological stats of `keyword_in` as it occurs in `sent`."""
    out = {"POS": None, "form": None, "lemma": None, "ending": None,
           "root": None, "root_tokens": None}
    est_punctuation = '„”'  # Estonian quotation marks, absent from string.punctuation
    for val in analyze(sent):
        temp = val["text"].lower().strip(string.punctuation + est_punctuation)
        if temp == keyword_in.lower():
            analysis = val["analysis"][0]  # first analysis candidate
            out["POS"] = analysis["partofspeech"]
            out["form"] = analysis["form"]
            out["ending"] = analysis["ending"]
            out["lemma"] = analysis["lemma"].split("|")[0]
            out["root"] = analysis["root"]
            out["root_tokens"] = analysis["root_tokens"]
            break
    return out
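A hypothetical call; the keyword and sentence are made up for illustration. The Estonian quotation marks around the keyword show why `est_punctuation` is stripped in addition to `string.punctuation`.

stats = return_word_stats('jaama', 'Rong jõudis „jaama”.')
print(stats['lemma'], stats['POS'], stats['form'])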
Example #5
def filter_words(word, origsentence, model, stats):
    """Split the words distributionally closest to `word` into inflected
    variants of the same word ("similar") and other words ("synonyms")."""
    output = {"similar": [], "synonyms": []}
    similar_words = []  # stays empty if no usable word vector is found

    if word.lower() in model.vocab:
        print("using the wordvec for word", word.lower())
        similar_words = model.most_similar(word.lower(), topn=200)
    else:
        # Surface form is out of vocabulary; fall back to its lemma/root.
        analyze_output_temp = analyze(origsentence)
        lemma_words = get_lemma_and_root_word(word, analyze_output_temp)
        for each_one in lemma_words:
            if each_one in model.vocab:
                print("using the wordvec for word", word)
                similar_words = model.most_similar(each_one, topn=200)
                break

    # Deduplicate while keeping gensim's similarity ordering, then drop
    # punctuation-only entries.
    unique_words = [each_word[0].lower() for each_word in similar_words]
    unique_words = list(OrderedDict.fromkeys(unique_words))
    unique_words = remove_punctuation_words(unique_words)

    for unique_word in unique_words:
        if word.lower() in unique_word or unique_word.startswith(stats['root']):
            output["similar"].append(unique_word)
        else:
            output["synonyms"].append(unique_word)

    syn, sim, lemma_and_root = get_lemmas(output["synonyms"], word, origsentence)
    output["synonyms"] = syn
    output["similar"].extend(sim)
    output["lemma"] = lemma_and_root

    return output
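A sketch of how this function might be wired up. It assumes a gensim (pre-4.0) KeyedVectors model, since the snippet relies on the old `model.vocab` and `model.most_similar` API, and that the undefined helpers (`get_lemma_and_root_word`, `remove_punctuation_words`) are available; the vector file path is a placeholder.

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('estonian_vectors.bin', binary=True)  # hypothetical path
sentence = 'Rong jõudis jaama.'
stats = return_word_stats('jaama', sentence)  # from Example #4
result = filter_words('jaama', sentence, model, stats)
print(result['similar'][:5], result['synonyms'][:5])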
Example #6
def get_lemmas(filtered_words, originalword, originalsentence):
    """Partition `filtered_words` into words introducing a new lemma and
    words whose lemma was already seen or overlaps the original word's."""
    out = {originalword: None}
    analyze_output = analyze(originalsentence)
    lemmas_and_roots = get_lemma_and_root_word(originalword, analyze_output)
    unique_lemma_words = []
    extra_lemma_words = []
    for word in filtered_words:
        text = Text(word)
        try:
            lemma_word = text.roots[0].split("|")[0]
            if lemma_word not in out:
                out[lemma_word] = None
                if any(new_word.lower() in lemma_word.lower()
                       for new_word in lemmas_and_roots):
                    extra_lemma_words.append(word)
                else:
                    unique_lemma_words.append(word)
            else:
                extra_lemma_words.append(word)
        except Exception:
            print("error for lemma word", word)
    return unique_lemma_words, extra_lemma_words, lemmas_and_roots
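The function leans on the estnltk 1.x `Text` class, whose `roots` property returns one root string per word, with compound parts joined by underscores. A minimal sketch; the output in the comment is illustrative.

from estnltk import Text

print(Text('allmaaraudteejaam').roots)  # e.g. ['all_maa_raud_tee_jaam']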
Example #7
# -*- coding: utf-8 -*-
'''Morphological analysis/synthesis example.'''
from __future__ import unicode_literals, print_function

from estnltk import analyze
from pprint import pprint

pprint(analyze('Tüünete öötööde allmaaraudteejaam'))


from estnltk import Tokenizer
from estnltk import PyVabamorfAnalyzer

tokenizer = Tokenizer()
analyzer = PyVabamorfAnalyzer()

text = '''Keeletehnoloogia on arvutilingvistika praktiline pool.
Keeletehnoloogid kasutavad arvutilingvistikas välja töötatud 
teooriaid, et luua rakendusi (nt arvutiprogramme), 
mis võimaldavad inimkeelt arvuti abil töödelda ja mõista. 

Tänapäeval on keeletehnoloogia tuntumateks valdkondadeks 
masintõlge, arvutileksikoloogia, dialoogisüsteemid, 
kõneanalüüs ja kõnesüntees.
'''

# first tokenize and then morphologically analyze
morf_analyzed = analyzer(tokenizer(text))

# print some results
print(morf_analyzed.lemmas)
print(morf_analyzed.postags)
Example #8
import re
import codecs
from sys import argv

from estnltk import analyze  # legacy estnltk 1.x API, as elsewhere on this page

# The original snippet uses `syn_idx_regexp` and `pos_regexp` without defining
# them; the first two patterns below are assumptions modelled on the LITERAL
# and SENSE patterns that were included.
syn_idx_regexp = re.compile(r"0\s+@(\d+)@\s+WORD_MEANING")
pos_regexp = re.compile(r"\s+1\s+PART_OF_SPEECH\s+\"(.+)\"")
literal_regexp = re.compile(r"\s+2\s+LITERAL\s\"(.+)\"")
sense_regexp = re.compile(r"\s+3\s+SENSE\s+(\d+)")

with codecs.open(argv[1], 'r', encoding='utf-8') as fin, \
        codecs.open("../synset_to_lemma.txt", 'w', encoding='utf-8') as fout:
    for line in fin:
        result = syn_idx_regexp.match(line)
        if result:
            syn_idx = result.group(1)
            continue

        result = pos_regexp.match(line)
        if result:
            pos = result.group(1)
            continue

        result = literal_regexp.match(line)
        if result:
            literal = result.group(1)
            continue

        result = sense_regexp.match(line)
        if result:
            sense = result.group(1)
            # One output line per analysis candidate of the literal.
            lemma_product = analyze([literal])[0]
            for candidate in lemma_product['analysis']:
                form = candidate['form']
                lemma = candidate['lemma']
                cand_pos = candidate['partofspeech']
                fout.write("%s@%s:%s:%02d@%s:%s:%s\n" % (
                    syn_idx, pos, literal, int(sense), lemma, form, cand_pos))
Example #9
def _lemmatize(self, sentence):
    """Return the deduplicated lemma candidates of all words in `sentence`."""
    return list({analysis['lemma']
                 for wa in analyze(sentence)
                 for analysis in wa['analysis']})
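Since the method touches nothing on `self`, its logic can be exercised stand-alone; a sketch reusing the sentence from the script example above.

from estnltk import analyze

sentence = 'Tüünete öötööde allmaaraudteejaam'
print(sorted({a['lemma'] for wa in analyze(sentence) for a in wa['analysis']}))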