def main(cache_size):
    """
    Sample usage.

    We use cache for recurring words such as pronouns, conjunctions,
    common verbs, modular and auxiliary verbs.
    """
    analyzer = Analyzer(char_subs_allowed=True)

    # Pick the caching strategy requested by the caller:
    # "unlim" -> unbounded memoization, a number -> LRU of that size,
    # anything falsy -> identity (no caching).
    if cache_size == "unlim":
        cached = memoize
    elif cache_size:
        cached = lrudecorator(cache_size)
    else:
        cached = lambda x: x
    analyze = cached(analyzer.analyze)

    for entry in analyzer.iter_lexicon_formatted(u"ge"):
        print(entry)
def expandToken(sent):
    """
    Annotate each token in *sent* with its sentence-internal index and,
    for nominal POS tags, gender/number information; then append POCs
    and clear their intermediate info.

    :param sent: iterable of tokens exposing ``.index`` and ``.tag``
    """
    # This is the morphy-analyzer (DEMorphy).
    analyzer = Analyzer(char_subs_allowed=True)
    # enumerate replaces the original hand-rolled index counter.
    for index, tok in enumerate(sent):
        tok.index = index
        if tok.tag in ("NOUN", "DET", "ADJ", "PROPN", "PRON"):
            appendGenderAndNumber(tok, sent, analyzer)
    appendPOCs(sent)
    clearPOCInfos(sent)
def extract_lemmas(lang):
    """
    Return dictionaries of the lemmas and words in language *lang*
    (with the respective features).

    :param lang: 'it' (read from ../data/lemmatizer_unique.txt) or
                 'de' (analyze the module-level ``vocab`` with DEMorphy)
    :return: tuple ``(words, lemmas)`` of defaultdict(list)
    :raises ValueError: for any other language code (the original fell
                        through to an UnboundLocalError)
    """
    if lang == 'it':
        words = defaultdict(list)
        lemmas = defaultdict(list)
        with open('../data/lemmatizer_unique.txt', 'r', encoding='latin-1') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) != 3:
                    continue
                atts = parts[2].split(':')
                # First colon field is the POS set, second (optional)
                # carries '+'-joined morphological features.
                features = set(atts[1].split('+')) if len(atts) > 1 else None
                pos = set(atts[0].split('-'))
                words[parts[0]].append((parts[1], pos, features))
                lemmas[parts[1]].append((parts[0], pos, features))
    elif lang == 'de':
        analyzer = Analyzer(char_subs_allowed=True)
        words = defaultdict(list)
        lemmas = defaultdict(list)
        for w in vocab:
            # Narrowed from a bare except: keep the best-effort skip but
            # don't swallow SystemExit/KeyboardInterrupt.
            try:
                s = analyzer.analyze(w)
            except Exception:
                continue
            if not s:
                continue
            for anlyss in s:
                features = ast.literal_eval(str(anlyss))
                words[w].append((features['LEMMA'], features))
                lemmas[features['LEMMA']].append((w, features))
    else:
        raise ValueError('unsupported language: %r' % (lang,))
    return words, lemmas
def build_word_pairs(words, max_pairs=None):
    """
    Analyze *words* with DEMorphy and group them into morphological pairs
    for verbs, adjectives and nouns.

    :param words: iterable of German surface forms
    :param max_pairs: unused; kept for interface compatibility
                      # TODO(review): confirm intended use or remove
    :return: tuple ``(verb_pairs, adj_pairs, noun_pairs)``
    """
    # Hoisted out of the loop: the original rebuilt the analyzer AND a
    # fresh cache wrapper on every iteration, which is expensive and made
    # the cache useless (each word saw an empty cache).
    analyzer = Analyzer(char_subs_allowed=True)
    cache_size = 200  # you can arrange the size or unlimited cache. For German lang, we recommed 200 as cache size.
    cached = memoize if cache_size == "unlim" else (
        lrudecorator(cache_size) if cache_size else (lambda x: x))
    analyze = cached(analyzer.analyze)

    word_infos = []
    for word in words:
        # The analyzer returns multiple possible forms;
        # we take the last one because it often seems to be the most intuitive.
        demorph_candidates = analyze(word)
        if not demorph_candidates:
            continue
        candidate_attr = demorph_candidates[-1]._fields
        if candidate_attr['CATEGORY'] not in CATEGORIES:
            continue
        # Remove all information that we don't need,
        # i.e. everything but the attributes in DF_COLUMNS.
        word_info = {
            key: value
            for key, value in candidate_attr.items()
            if key in DF_COLUMNS
        }
        word_info['WORD'] = word
        word_infos.append(word_info)

    df_words = pd.DataFrame(word_infos).fillna('unk')

    verb_groups = df_words.loc[df_words['CATEGORY'] == 'V'].groupby(
        ['CATEGORY', 'TENSE', 'NUMERUS', 'PERSON', 'MODE'])
    verb_pairs = get_pairs_from_groupby(verb_groups)

    adj_groups = df_words.loc[df_words['CATEGORY'] == 'ADJ'].groupby(
        ['CATEGORY', 'DEGREE', 'CASE', 'NUMERUS'])
    adj_pairs = get_pairs_from_groupby(adj_groups)

    noun_groups = df_words.loc[df_words['CATEGORY'] == 'NN'].groupby(
        ['CATEGORY', 'CASE', 'GENDER', 'NUMERUS'])
    noun_pairs = get_pairs_from_groupby(noun_groups)

    return verb_pairs, adj_pairs, noun_pairs
from mydict import PersonalDictionary

DEBUG_LEVEL = "DEBUG"
msg = get_logger("Deutschkurs", DEBUG_LEVEL)
msg.info("Starting Deutschkurs")

# Ensure NLTK's Punkt sentence tokenizer is present; fetch it on demand.
try:
    nltk.data.find('tokenizers/punkt')
    msg.debug("Tokenizer Punkt found!")
except LookupError as error:
    msg.warning("Tokenizer Punkt not found!")
    msg.info("Downloading Tokenizer Punkt for NLTK")
    nltk.download('punkt')

analyzer = Analyzer(char_subs_allowed=True)
msg.debug("DEMorphy initialited")

try:
    msg.debug("Loading German corpus")
    # ~ nlp = spacy.load("de_core_news_lg")
    nlp = spacy.load("de_dep_news_trf")
# Narrowed from a bare except; spacy.load raises OSError (an Exception
# subclass) when the model package is missing.
except Exception:
    msg.error("German corpus not found. Download it manually:")
    msg.info("python3 -m spacy download de_dep_news_trf")
    # python3 -m spacy download de_core_news_sm
    exit(-1)

# Check directories existence
def lookupDEMorphy(found_verb):
    """
    Look up the 1st-person-singular present and past tense forms of
    *found_verb* (a token exposing ``.lemma_``) in the given text.

    Already-known verbs are served from the module-level
    ``known_verb_list`` cache; otherwise DEMorphy's lexicon is scanned
    with a series of prefixes (lemma, first syllable, 'ge', first
    letter). Matching helpers live in ``search_verbs``.

    :return: dict ``printString`` with optional keys 'pres1' and 'past1'
             holding strings like ``'ich gebe'``
    """
    from demorphy import Analyzer
    from searchVerbs import search_verbs

    # known_verb_list is module-level state that search_unknown_verbs
    # extends as new forms are discovered.
    global known_verb_list

    printString = {}

    # Build the lookup prefixes, from most to least specific.
    preparedFoundVerb = str(found_verb).strip(' ')
    firstLetter = preparedFoundVerb[0]
    allHyph = de_DE.syllables(preparedFoundVerb)
    firstHyph = allHyph[0] if allHyph else preparedFoundVerb
    lookupProcess = [found_verb.lemma_, firstHyph, 'ge', firstLetter]

    foundInDEMorphy = 0
    knownVerbFound = ["", ""]
    for lookupTerm in lookupProcess:
        print(f"foundInDEMorphy: {foundInDEMorphy}")
        if foundInDEMorphy == 3:
            # Enough hits already; skip the remaining (cheaper) prefixes.
            print(
                "will continue because foundIn DEMorphy is more than necessary"
            )
            continue

        # 1) Look up already-known words.
        print('looking up known_verb_list with:\n' + "\t" * 5 + lookupTerm + "\n")
        for knownVerbs in known_verb_list:
            # 1 Person Praesens singular
            if search_verbs.search_known_verbs(knownVerbs, "pres", found_verb):
                pres1 = 'ich ' + knownVerbs[0]
                print('\twill be printed: Präsens: \t ich ' + knownVerbs[0])
                printString['pres1'] = pres1
                knownVerbFound[0] = "pres found"
            # 1 Person Praeteritum singular
            if search_verbs.search_known_verbs(knownVerbs, "past", found_verb):
                print('\twill be printed: Präteritum: \t ich ' + knownVerbs[0])
                past1 = 'ich ' + knownVerbs[0]
                printString['past1'] = past1
                knownVerbFound[1] = "past found"
            if knownVerbFound[0] == "pres found" and knownVerbFound[1] == "past found":
                print("already found everything. continue with next verb")
                break

        # 2) DEMorphy lexicon look-up for forms not yet cached.
        #    (The original wrapped this in a redundant `if True:`.)
        analyzer = Analyzer(char_subs_allowed=True)
        DEMorphy = analyzer.iter_lexicon_formatted(prefix=lookupTerm)
        print('looking up \033[31mDEMorphy\033[0m with:' + lookupTerm + "\n" * 5)
        for word in DEMorphy:
            if word == '':
                print("will continue word")
                continue
            # Präsens sing
            known_verb_list, printString, foundInDEMorphy = search_verbs.search_unknown_verbs(
                foundInDEMorphy=foundInDEMorphy,
                word=word,
                Tense="pres",
                printString=printString,
                known_verb_list=known_verb_list,
                found_verb=found_verb)
            # Präteritum sing
            known_verb_list, printString, foundInDEMorphy = search_verbs.search_unknown_verbs(
                foundInDEMorphy=foundInDEMorphy,
                word=word,
                Tense="past",
                printString=printString,
                known_verb_list=known_verb_list,
                found_verb=found_verb)

    return printString
from __future__ import absolute_import, unicode_literals, division

import codecs
import datetime
import functools
import gc
import logging
import os
import time
import timeit

from demorphy import Analyzer

analyzer = Analyzer(char_subs_allowed=True)

logger = logging.getLogger('demorphy.bench')


def measure_indiv(func, inner_iterations=1, repeats=5):
    """
    Benchmark *func* and return its rate: inner_iterations divided by the
    best of *repeats* wall-clock timings.

    GC is disabled while timing to reduce noise. The try/finally ensures
    it is re-enabled even if *func* raises (the original would leak a
    disabled GC on error).
    """
    gc.disable()
    try:
        times = []
        for _ in range(repeats):
            start = time.time()
            func()
            times.append(time.time() - start)
    finally:
        gc.enable()
    return inner_iterations / min(times)


def load_data(path):
    words = []