def extract_lemmas(lang):
    """Return (words, lemmas) mappings for language *lang*.

    For ``lang == 'it'`` the data is parsed from
    ``../data/lemmatizer_unique.txt`` (latin-1, tab-separated:
    word <TAB> lemma <TAB> "POS[:feat1+feat2+...]"). Each ``words`` entry
    maps a surface form to a list of ``(lemma, pos_set, features_set)``
    tuples; ``lemmas`` is the reverse mapping.

    For ``lang == 'de'`` each word of the module-level ``vocab`` iterable
    is run through the morphological ``Analyzer``; tuples are then
    ``(lemma, features_dict)`` / ``(word, features_dict)``.

    Returns:
        tuple[defaultdict, defaultdict]: ``(words, lemmas)``. Both are
        empty for an unsupported *lang* (the previous version raised
        ``UnboundLocalError`` in that case, since the dicts were only
        bound inside the language branches).
    """
    # Bind once, up front, so every code path reaches the return safely.
    words = defaultdict(list)
    lemmas = defaultdict(list)

    if lang == 'it':
        with open('../data/lemmatizer_unique.txt', 'r', encoding='latin-1') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) != 3:
                    continue  # skip malformed lines
                atts = l[2].split(':')
                # Third column is "POS:feat1+feat2"; features are optional.
                if len(atts) > 1:
                    features = set(atts[1].split('+'))
                else:
                    features = None
                pos = set(atts[0].split('-'))
                words[l[0]].append((l[1], pos, features))
                lemmas[l[1]].append((l[0], pos, features))
    elif lang == 'de':
        analyzer = Analyzer(char_subs_allowed=True)
        for w in vocab:
            try:
                s = analyzer.analyze(w)
            except Exception:
                # Narrowed from a bare ``except:`` so KeyboardInterrupt and
                # SystemExit are no longer silently swallowed; words the
                # analyzer cannot handle are still skipped best-effort.
                continue
            # An empty analysis list simply contributes no entries, so the
            # old explicit ``len(s) == 0: continue`` guard is unnecessary.
            for anlyss in s:
                # Analyses stringify to a dict literal; parse it safely.
                features = ast.literal_eval(str(anlyss))
                words[w].append((features['LEMMA'], features))
                lemmas[features['LEMMA']].append((w, features))
    return words, lemmas
# --- Persist the word cache, then emit one AsciiDoc file per cached word. ---
# NOTE(review): ``cache``, ``cache_path``, ``analyzer``, ``msg`` and ``pd``
# are defined earlier in the file, outside this chunk.
with open(cache_path, 'w') as fc:
    json.dump(cache, fc)
msg.debug("Words cache saved")

# Generate docs
# AsciiDoc templates: document title, one "key: value" header attribute
# line, and the end-of-header sentinel the downstream tooling relies on.
TITLE = "= %s\n\n"
PROP = ":%s:\t\t%s\n"
EOHMARK = "// END-OF-HEADER. DO NOT MODIFY OR DELETE THIS LINE\n\n"
# NOTE(review): "lema" in the body text is presumably a typo for "lemma",
# but the cache entries below use the same 'lema' key -- both spellings
# must be changed together (and caches regenerated), so left as-is here.
BODY = "_%s_ is a _%s_. Its lema is _%s_."
# FIXME: Write Nouns in capital
# FIXME: Check suffixes
for key in cache:
    # One docs/<word>.adoc file per cache entry, overwritten on each run.
    doc_path = os.path.join('docs', "%s.adoc" % key)
    with open(doc_path, 'w') as fdp:
        # Analysis result is currently unused except by the commented-out
        # debug dump below -- TODO confirm whether this call can be dropped.
        s = analyzer.analyze(key)
        # ~ print(key)
        # ~ print('='*len(key))
        # ~ pp.pprint(s)
        # ~ print()
        # ~ print()
        fdp.write(TITLE % cache[key]['word'])
        fdp.write(PROP % ("Part Of Speech", cache[key]['pos']))
        fdp.write(PROP % ("Lema", cache[key]['lema']))
        fdp.write(PROP % ("Prefix", cache[key]['prefix']))
        fdp.write(PROP % ("Suffix", cache[key]['suffix']))
        fdp.write(EOHMARK)
        fdp.write(BODY % (cache[key]['word'], cache[key]['pos'], cache[key]['lema']))
# NOTE(review): ``pd.missing()`` will raise AttributeError (no such
# attribute on pandas) -- this looks like a deliberate crash marker for
# unfinished work, since it prevents the final log line from running.
# Confirm intent with the author before removing it.
pd.missing()
msg.info("Ending Deutschkurs")