Example #1
0
def main(cache_size):
    """
    Sample usage.

    We use a cache for recurring words such as pronouns, conjunctions,
    common verbs, and modal/auxiliary verbs.
    """
    analyzer = Analyzer(char_subs_allowed=True)
    # Pick the caching strategy: unbounded memoization, an LRU cache of the
    # given size, or no caching at all.
    if cache_size == "unlim":
        cached = memoize
    elif cache_size:
        cached = lrudecorator(cache_size)
    else:
        cached = lambda x: x
    analyze = cached(analyzer.analyze)
    for entry in analyzer.iter_lexicon_formatted(u"ge"):
        print(entry)
def expandToken(sent):
    """Annotate every token in ``sent`` with its sentence-internal index,
    attach gender/number information to nominal tokens, then run the
    phrase-chunk post-processing passes.

    :param sent: iterable of tokens with ``tag``/``index`` attributes
                 (presumably spaCy tokens — TODO confirm against caller).
    """
    # morphological analyzer (DEMorphy)
    analyzer = Analyzer(char_subs_allowed=True)
    # enumerate replaces the manual sentence-internal index counter
    for index, tok in enumerate(sent):
        tok.index = index
        # only nominal/pronominal categories carry gender and number
        if tok.tag in ["NOUN", "DET", "ADJ", "PROPN", "PRON"]:
            appendGenderAndNumber(tok, sent, analyzer)
    appendPOCs(sent)
    clearPOCInfos(sent)
Example #3
0
def extract_lemmas(lang):
    """
    Return dictionaries of the lemmas and words in language ``lang``
    (with the respective features).

    :param lang: 'it' reads ../data/lemmatizer_unique.txt; 'de' analyzes the
                 module-level ``vocab`` with DEMorphy. Any other value yields
                 two empty mappings.
    :return: tuple ``(words, lemmas)`` of ``defaultdict(list)``;
             words maps surface form -> [(lemma, pos, features)],
             lemmas maps lemma -> [(surface form, pos, features)].
    """
    # Bug fix: initialize up front so an unsupported ``lang`` no longer
    # raises NameError at the return statement.
    words = defaultdict(list)
    lemmas = defaultdict(list)

    if lang == 'it':
        with open('../data/lemmatizer_unique.txt', 'r',
                  encoding='latin-1') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 3:
                    continue
                atts = fields[2].split(':')
                # features follow the POS after ':', '+'-separated
                features = set(atts[1].split('+')) if len(atts) > 1 else None
                pos = set(atts[0].split('-'))
                words[fields[0]].append((fields[1], pos, features))
                lemmas[fields[1]].append((fields[0], pos, features))

    if lang == 'de':
        analyzer = Analyzer(char_subs_allowed=True)
        # NOTE(review): iterates the module-level ``vocab`` — confirm it is
        # defined before this is called.
        for w in vocab:
            try:
                s = analyzer.analyze(w)
            except Exception:
                # best-effort: skip words the analyzer cannot handle
                # (was a bare except; narrowed to Exception)
                continue
            if len(s) == 0:
                continue
            for anlyss in s:
                features = ast.literal_eval(str(anlyss))
                words[w].append((features['LEMMA'], features))
                lemmas[features['LEMMA']].append((w, features))

    return words, lemmas
def build_word_pairs(words, max_pairs=None):
    """Group morphologically analyzed words and build verb/adjective/noun pairs.

    :param words: iterable of word strings to analyze with DEMorphy.
    :param max_pairs: currently unused — NOTE(review): confirm whether the
                      returned pair lists should be truncated; keeping the
                      existing no-op behavior for compatibility.
    :return: tuple ``(verb_pairs, adj_pairs, noun_pairs)`` as produced by
             ``get_pairs_from_groupby``.
    """
    # Bug fix: the analyzer and its cache were rebuilt inside the loop,
    # which re-loaded the lexicon and emptied the cache on every word.
    # Hoisting them makes the cache actually effective.
    analyzer = Analyzer(char_subs_allowed=True)
    # You can adjust the size or use an unlimited cache ("unlim").
    # For German we recommend 200 as cache size.
    cache_size = 200
    cached = memoize if cache_size == "unlim" else (
        lrudecorator(cache_size) if cache_size else (lambda x: x))
    analyze = cached(analyzer.analyze)

    word_infos = []
    for word in words:
        # The analyzer returns multiple possible forms; we take the last one
        # because it often seems to be the most intuitive.
        demorph_candidates = analyze(word)
        if len(demorph_candidates) == 0:
            continue
        candidate_attr = demorph_candidates[-1]._fields
        if candidate_attr['CATEGORY'] not in CATEGORIES:
            continue
        # remove all information that we don't need,
        # i.e. everything but the attributes in DF_COLUMNS
        word_info = {
            key: value
            for key, value in candidate_attr.items()
            if key in DF_COLUMNS
        }
        word_info['WORD'] = word
        word_infos.append(word_info)

    df_words = pd.DataFrame(word_infos)
    df_words = df_words.fillna('unk')

    # Pair words that agree on all grammatical features within a category.
    verb_groups = df_words.loc[df_words['CATEGORY'] == 'V'].groupby(
        ['CATEGORY', 'TENSE', 'NUMERUS', 'PERSON', 'MODE'])
    verb_pairs = get_pairs_from_groupby(verb_groups)

    adj_groups = df_words.loc[df_words['CATEGORY'] == 'ADJ'].groupby(
        ['CATEGORY', 'DEGREE', 'CASE', 'NUMERUS'])
    adj_pairs = get_pairs_from_groupby(adj_groups)

    noun_groups = df_words.loc[df_words['CATEGORY'] == 'NN'].groupby(
        ['CATEGORY', 'CASE', 'GENDER', 'NUMERUS'])
    noun_pairs = get_pairs_from_groupby(noun_groups)
    return verb_pairs, adj_pairs, noun_pairs
Example #5
0
from mydict import PersonalDictionary

DEBUG_LEVEL = "DEBUG"

msg = get_logger("Deutschkurs", DEBUG_LEVEL)
msg.info("Starting Deutschkurs")

# Make sure the NLTK sentence tokenizer is available; download it on first run.
try:
    nltk.data.find('tokenizers/punkt')
    msg.debug("Tokenizer Punkt found!")
except LookupError as error:
    msg.warning("Tokenizer Punkt not found!")
    msg.info("Downloading Tokenizer Punkt for NLTK")
    nltk.download('punkt')

# Module-level DEMorphy analyzer shared by the rest of the script.
analyzer = Analyzer(char_subs_allowed=True)
msg.debug("DEMorphy initialized")

try:
    msg.debug("Loading German corpus")
    # ~ nlp = spacy.load("de_core_news_lg")
    nlp = spacy.load("de_dep_news_trf")
except Exception:
    # was a bare except: — narrowed so SystemExit/KeyboardInterrupt pass through
    msg.error("German corpus not found. Download it manually:")
    msg.info("python3 -m spacy download de_dep_news_trf")
    # python3 -m spacy download de_core_news_sm
    exit(-1)


# Check directories existence
def lookupDEMorphy(found_verb):
    """Look up the 1st person singular present and past tense of a verb.

    Checks the cached ``known_verb_list`` first, then queries DEMorphy's
    lexicon with increasingly generic prefixes (lemma, first syllable,
    'ge', first letter).  Newly found forms are appended to the cache by
    ``search_verbs.search_unknown_verbs``.

    :param found_verb: a token with ``lemma_`` (presumably a spaCy token —
                       TODO confirm against caller).
    :return: dict with keys 'pres1' / 'past1' mapping to printable strings
             like 'ich gebe'; a key is absent when that form was not found.
    """
    # Cleanup: removed unused imports (subprocess, os, shelve), large
    # commented-out code blocks, and a no-op ``if True:`` wrapper.
    from demorphy import Analyzer
    from searchVerbs import search_verbs

    # NOTE(review): depends on the module-level ``known_verb_list`` cache
    # (persisted via a shelve file elsewhere) and on ``de_DE`` for
    # hyphenation — both must be initialized before this is called.
    global known_verb_list
    printString = {}

    # Build the prefix terms used to query DEMorphy, most specific first.
    preparedFoundVerb = str(found_verb).strip(' ')
    firstLetter = preparedFoundVerb[0]
    allHyph = de_DE.syllables(preparedFoundVerb)
    if allHyph == []:
        firstHyph = preparedFoundVerb
    else:
        firstHyph = allHyph[0]
    lookupProcess = [found_verb.lemma_, firstHyph, 'ge', firstLetter]

    foundInDEMorphy = 0
    knownVerbFound = ["", ""]
    for lookupTerm in lookupProcess:
        print(f"foundInDEMorphy: {foundInDEMorphy}")
        # Enough matches already recorded — skip further lexicon queries.
        if foundInDEMorphy == 3:
            print(
                "will continue because foundIn DEMorphy is more than necessary"
            )
            continue

        # Look up already known words first.
        print('looking up known_verb_list with:\n' + "\t" * 5 + lookupTerm +
              "\n")
        for knownVerbs in known_verb_list:

            # 1st person singular present
            if search_verbs.search_known_verbs(knownVerbs, "pres", found_verb):
                pres1 = 'ich ' + knownVerbs[0]
                print('\twill be printed: Präsens: \t ich ' + knownVerbs[0])
                printString['pres1'] = pres1
                knownVerbFound[0] = "pres found"

            # 1st person singular past (Präteritum)
            if search_verbs.search_known_verbs(knownVerbs, "past", found_verb):
                print('\twill be printed: Präteritum: \t ich ' + knownVerbs[0])
                past1 = 'ich ' + knownVerbs[0]
                printString['past1'] = past1
                knownVerbFound[1] = "past found"

        # Both forms found in the cache — no lexicon query needed.
        if knownVerbFound[0] == "pres found" and knownVerbFound[
                1] == "past found":
            print(f"already found everything. continue with next verb")
            break

        # Fall back to a DEMorphy lexicon prefix query.
        analyzer = Analyzer(char_subs_allowed=True)
        DEMorphy = analyzer.iter_lexicon_formatted(prefix=lookupTerm)
        print('looking up \033[31mDEMorphy\033[0m with:' + lookupTerm +
              "\n" * 5)
        for word in DEMorphy:
            if word == '':
                print("will continue word")
                continue

            # present singular
            known_verb_list, printString, foundInDEMorphy = search_verbs.search_unknown_verbs(
                foundInDEMorphy=foundInDEMorphy,
                word=word,
                Tense="pres",
                printString=printString,
                known_verb_list=known_verb_list,
                found_verb=found_verb)

            # past singular (Präteritum)
            known_verb_list, printString, foundInDEMorphy = search_verbs.search_unknown_verbs(
                foundInDEMorphy=foundInDEMorphy,
                word=word,
                Tense="past",
                printString=printString,
                known_verb_list=known_verb_list,
                found_verb=found_verb)

    return printString
Example #7
0
from __future__ import absolute_import, unicode_literals, division
import codecs
import datetime
import functools
import logging
import os

import time
import timeit
import gc

from demorphy import Analyzer
# Module-level analyzer shared by the benchmark helpers below.
analyzer = Analyzer(char_subs_allowed=True)

# Logger for the DEMorphy benchmark module.
logger = logging.getLogger('demorphy.bench')


def measure_indiv(func, inner_iterations=1, repeats=5):
    """Benchmark ``func`` and return its speed in iterations per second.

    ``func`` is called ``repeats`` times and the fastest run is used.
    ``inner_iterations`` is the number of iterations ``func`` performs
    internally per call (the caller's contract), used to scale the result.

    :raises: whatever ``func`` raises; GC is re-enabled regardless.
    """
    gc.disable()
    try:
        times = []
        for _ in range(repeats):
            # perf_counter is monotonic and higher-resolution than time.time()
            start = time.perf_counter()
            func()
            times.append(time.perf_counter() - start)
    finally:
        # Bug fix: the original left GC disabled if func() raised.
        gc.enable()
    return inner_iterations / min(times)


def load_data(path):
    words = []