Example #1
def annotate_important_features(self, text, features):
    """Wrap every token whose surface form or lemma matches one of the
    given features in a colour annotation."""
    toks = text.split()
    features = self.trim_feature_prefixes(features)
    result = []
    for tok in toks:
        # Match candidates: the surface token plus every lemma the
        # morphological analyser proposes for it.
        variants = [tok] + [an['lemma'] for an in analyze([tok])[0]['analysis']]
        variants = [self._unifier.unify(v.lower()) for v in variants]
        match = self.does_match(variants, features)
        if match is not None:
            tok = self.annotate_color(tok, match)
        result.append(tok)
    return ' '.join(result)
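For reference, the legacy estnltk `analyze` output that this method indexes into has roughly the shape below. The keys are the ones used across the examples on this page; the sample values are illustrative, not verified analyser output.

analyze(['öötööde'])
# [{'text': 'öötööde',
#   'analysis': [{'lemma': 'öötöö', 'root': 'öö_töö', 'root_tokens': ['öö', 'töö'],
#                 'ending': 'de', 'form': 'pl g', 'partofspeech': 'S'}]}]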
Example #2
def morphy(word):
    """Performs morphological analysis on `word`.

    Parameters
    ----------
    word : str
      Word to be lemmatized.

    Returns
    -------
    str or None
      Lemma of the `word`, or None if the analyser returns no analysis.

    """
    analyzed = analyze([word])
    # Take the first analysis candidate of the last (here: only) token.
    return analyzed[-1]['analysis'][0]['lemma'] if analyzed else None
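A minimal usage sketch, assuming the legacy `from estnltk import analyze` import used throughout these examples; the sample word and the lemma in the comment are illustrative.

from estnltk import analyze  # required by morphy above

print(morphy('öötööde'))  # expected to print a lemma such as 'öötöö'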
Example #4
import string

from estnltk import analyze  # legacy estnltk 1.x API, as elsewhere on this page


def return_word_stats(keyword_in, sent):
    """Return the morphological stats of `keyword_in` as it occurs in `sent`."""
    out = {"POS": None, "form": None, "lemma": None, "ending": None,
           "root": None, "root_tokens": None}
    est_punctuation = '„”'  # Estonian quotation marks, absent from string.punctuation
    for val in analyze(sent):
        temp = val["text"].lower().strip(string.punctuation + est_punctuation)
        if temp == keyword_in.lower():
            analysis = val["analysis"][0]  # first analysis candidate
            out["POS"] = analysis["partofspeech"]
            out["form"] = analysis["form"]
            out["ending"] = analysis["ending"]
            out["lemma"] = analysis["lemma"].split("|")[0]
            out["root"] = analysis["root"]
            out["root_tokens"] = analysis["root_tokens"]
            break
    return out
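A hypothetical call; the keyword and sentence are made up for illustration. The Estonian quotation marks around the keyword show why `est_punctuation` is stripped in addition to `string.punctuation`.

stats = return_word_stats('jaama', 'Rong jõudis „jaama”.')
print(stats['lemma'], stats['POS'], stats['form'])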
Example #5
def filter_words(word, origsentence, model, stats):
    """Split the words distributionally closest to `word` into inflected
    variants of the same word ("similar") and other words ("synonyms")."""
    output = {"similar": [], "synonyms": []}
    similar_words = []  # stays empty if no usable word vector is found

    if word.lower() in model.vocab:
        print("using the wordvec for word", word.lower())
        similar_words = model.most_similar(word.lower(), topn=200)
    else:
        # Surface form is out of vocabulary; fall back to its lemma/root.
        analyze_output_temp = analyze(origsentence)
        lemma_words = get_lemma_and_root_word(word, analyze_output_temp)
        for each_one in lemma_words:
            if each_one in model.vocab:
                print("using the wordvec for word", word)
                similar_words = model.most_similar(each_one, topn=200)
                break

    # Deduplicate while keeping gensim's similarity ordering, then drop
    # punctuation-only entries.
    unique_words = [each_word[0].lower() for each_word in similar_words]
    unique_words = list(OrderedDict.fromkeys(unique_words))
    unique_words = remove_punctuation_words(unique_words)

    for unique_word in unique_words:
        if word.lower() in unique_word or unique_word.startswith(stats['root']):
            output["similar"].append(unique_word)
        else:
            output["synonyms"].append(unique_word)

    syn, sim, lemma_and_root = get_lemmas(output["synonyms"], word, origsentence)
    output["synonyms"] = syn
    output["similar"].extend(sim)
    output["lemma"] = lemma_and_root

    return output
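A sketch of how this function might be wired up. It assumes a gensim (pre-4.0) KeyedVectors model, since the snippet relies on the old `model.vocab` and `model.most_similar` API, and that the undefined helpers (`get_lemma_and_root_word`, `remove_punctuation_words`) are available; the vector file path is a placeholder.

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('estonian_vectors.bin', binary=True)  # hypothetical path
sentence = 'Rong jõudis jaama.'
stats = return_word_stats('jaama', sentence)  # from Example #4
result = filter_words('jaama', sentence, model, stats)
print(result['similar'][:5], result['synonyms'][:5])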
Example #6
def get_lemmas(filtered_words, originalword, originalsentence):
    """Partition `filtered_words` into words introducing a new lemma and
    words whose lemma was already seen or overlaps the original word's."""
    out = {originalword: None}
    analyze_output = analyze(originalsentence)
    lemmas_and_roots = get_lemma_and_root_word(originalword, analyze_output)
    unique_lemma_words = []
    extra_lemma_words = []
    for word in filtered_words:
        text = Text(word)
        try:
            lemma_word = text.roots[0].split("|")[0]
            if lemma_word not in out:
                out[lemma_word] = None
                if any(new_word.lower() in lemma_word.lower()
                       for new_word in lemmas_and_roots):
                    extra_lemma_words.append(word)
                else:
                    unique_lemma_words.append(word)
            else:
                extra_lemma_words.append(word)
        except Exception:
            print("error for lemma word", word)
    return unique_lemma_words, extra_lemma_words, lemmas_and_roots
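The function leans on the estnltk 1.x `Text` class, whose `roots` property returns one root string per word, with compound parts joined by underscores. A minimal sketch; the output in the comment is illustrative.

from estnltk import Text

print(Text('allmaaraudteejaam').roots)  # e.g. ['all_maa_raud_tee_jaam']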
Example #7
# -*- coding: utf-8 -*-
'''Morphological analysis/synthesis example.'''
from __future__ import unicode_literals, print_function

from estnltk import analyze
from pprint import pprint

pprint(analyze('Tüünete öötööde allmaaraudteejaam'))


from estnltk import Tokenizer
from estnltk import PyVabamorfAnalyzer

tokenizer = Tokenizer()
analyzer = PyVabamorfAnalyzer()

text = '''Keeletehnoloogia on arvutilingvistika praktiline pool.
Keeletehnoloogid kasutavad arvutilingvistikas välja töötatud 
teooriaid, et luua rakendusi (nt arvutiprogramme), 
mis võimaldavad inimkeelt arvuti abil töödelda ja mõista. 

Tänapäeval on keeletehnoloogia tuntumateks valdkondadeks 
masintõlge, arvutileksikoloogia, dialoogisüsteemid, 
kõneanalüüs ja kõnesüntees.
'''

# first tokenize and then morphologically analyze
morf_analyzed = analyzer(tokenizer(text))

# print some results
print(morf_analyzed.lemmas)
print(morf_analyzed.postags)
Example #8
import re
import codecs
from sys import argv

from estnltk import analyze  # legacy estnltk 1.x API, as elsewhere on this page

# The original snippet uses `syn_idx_regexp` and `pos_regexp` without defining
# them; the first two patterns below are assumptions modelled on the LITERAL
# and SENSE patterns that were included.
syn_idx_regexp = re.compile(r"0\s+@(\d+)@\s+WORD_MEANING")
pos_regexp = re.compile(r"\s+1\s+PART_OF_SPEECH\s+\"(.+)\"")
literal_regexp = re.compile(r"\s+2\s+LITERAL\s\"(.+)\"")
sense_regexp = re.compile(r"\s+3\s+SENSE\s+(\d+)")

with codecs.open(argv[1], 'r', encoding='utf-8') as fin, \
        codecs.open("../synset_to_lemma.txt", 'w', encoding='utf-8') as fout:
    for line in fin:
        result = syn_idx_regexp.match(line)
        if result:
            syn_idx = result.group(1)
            continue

        result = pos_regexp.match(line)
        if result:
            pos = result.group(1)
            continue

        result = literal_regexp.match(line)
        if result:
            literal = result.group(1)
            continue

        result = sense_regexp.match(line)
        if result:
            sense = result.group(1)
            # One output line per analysis candidate of the literal.
            lemma_product = analyze([literal])[0]
            for candidate in lemma_product['analysis']:
                form = candidate['form']
                lemma = candidate['lemma']
                cand_pos = candidate['partofspeech']
                fout.write("%s@%s:%s:%02d@%s:%s:%s\n" % (
                    syn_idx, pos, literal, int(sense), lemma, form, cand_pos))
Example #9
def _lemmatize(self, sentence):
    """Return the deduplicated lemma candidates of all words in `sentence`."""
    return list({analysis['lemma']
                 for wa in analyze(sentence)
                 for analysis in wa['analysis']})
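Since the method touches nothing on `self`, its logic can be exercised stand-alone; a sketch reusing the sentence from the script example above.

from estnltk import analyze

sentence = 'Tüünete öötööde allmaaraudteejaam'
print(sorted({a['lemma'] for wa in analyze(sentence) for a in wa['analysis']}))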