class EnglishUralicNLPMorphologicalRealizer(LanguageSpecificMorphologicalRealizer):
    def __init__(self):
        super().__init__("en")

        self.case_map: Dict[str, str] = {"genitive": "GEN"}

    def realize(self, slot: Slot) -> str:
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to English".format(slot.value))

        case = self.case_map.get(case.lower(), case.upper())
        log.debug("Normalized case {} to {}".format(slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "eng")
        log.debug("Identified {} possible analyses".format(len(possible_analyses)))
        if len(possible_analyses) == 0:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute".format(slot.value)
            )
            return slot.value

        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(analysis, slot.value))

        analysis = "{}+{}".format(analysis, case)
        log.debug("Modified analysis to {}".format(analysis))

        modified_value = uralicApi.generate(analysis, "eng")[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
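# A minimal sketch of the analyze -> retag -> generate round trip the realizers
# in this collection build on, shown with the Finnish model whose tag format
# appears in the snippets below. The example word and outputs are illustrative,
# and the model must first be installed once with uralicApi.download("fin").
from uralicNLP import uralicApi

analysis = uralicApi.analyze("kissa", "fin")[0][0]   # e.g. "kissa+N+Sg+Nom"
genitive = analysis.replace("+Nom", "+Gen")          # retag nominative as genitive
print(uralicApi.generate(genitive, "fin")[0][0])     # expected: "kissan"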
def _filter_generated(res, lemma):
    if len(res) < 2:
        return res
    for r in res:
        r_as = uralicApi.analyze(r[0], "fin", dictionary_forms=True)
        for r_a in r_as:
            r_a = r_a[0]
            # Keep the first candidate whose analysis is neither archaic nor
            # dialectal and whose lemma matches.
            if "+Use/Arch" not in r_a and "+Dial/" not in r_a and r_a.startswith(lemma):
                return [r]
    # Implicitly returns None when every candidate is archaic, dialectal,
    # or has an unexpected lemma.
def _deriv_analysis(word, words):
    anas = uralicApi.analyze(word, "sms")
    deriv_lemmas = []
    for ana in anas:
        a = ana[0]
        if "+Der" in a:
            w = a.split("+")[0]
            if w != word:
                w_i = get_id(w, words)
                deriv_lemmas.append([w_i + "_" + w, a])
    return deriv_lemmas
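# Synthetic illustration of the lemma extraction in _deriv_analysis(); the
# analysis string is made up, only its "+Der" shape matters here.
a = "kuett+N+Der/Dimin+Sg+Nom"   # hypothetical Skolt Sami derivation reading
if "+Der" in a:
    print(a.split("+")[0])       # -> "kuett", the lemma before the first "+"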
def list_not_in_fst():
    csv = open_read("/Users/mikahama/Downloads/2019-10-08T12_15_55-export.csv")
    csv.readline()  # skip the header row
    r = []
    for l in csv:
        w = l.split(",")[2]
        a = uralicApi.analyze(w, "sms")
        if len(a) == 0:
            r.append(w)
    out = open_write("jackille_sms.txt")
    out.write("\n".join(r))
    out.close()
def _pos_tag(words):
    pos_tags = {"A": [], "Adv": [], "V": [], "N": [], "UNK": []}
    accepted_tags = set(pos_tags.keys())
    for word in words:
        analysis = uralicApi.analyze(word[0], "fin", force_local=True)
        tag = "UNK"
        for analys in analysis:
            # _merge_compound_analysis() is expected to yield (lemma, pos, ...).
            analys = _merge_compound_analysis(analys[0])
            if word[0] == analys[0] and analys[1] in accepted_tags:
                tag = analys[1]
                break
        pos_tags[tag].append(word)
    return pos_tags
async def lemmatize(ctx, arg):
    response = []
    lines = uralicApi.analyze(arg, "fin")
    if len(lines) == 0:
        await ctx.send('word not found.')
        return
    for line in lines:
        response.append(line[0])
    response = '\n'.join(response)
    await ctx.send(response)
def analyze(word, lang):
    try:
        a = uralicApi.analyze(word, lang)
        a = map(lambda r: r[0].split('+'), a)
        # Keep only readings whose lemma matches the surface form.
        a = list(filter(lambda r: r[0] == word, a))
        if not a:
            return [[None]]
        a = list(map(lambda r: r[1:], a))   # drop the lemma, keep the tags
        a = list(filter(lambda r: r, a))
        return a
    except Exception:
        pass
    return [[None]]
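# Hypothetical usage of the analyze() wrapper above; the tag lists in the
# comments are illustrative and depend on the installed model.
print(analyze("kissa", "fin"))   # e.g. [['N', 'Sg', 'Nom']]
print(analyze("xyzzy", "fin"))   # -> [[None]] when no reading matches the surface form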
def get_pos_template(lemmas):
    '''
    Return the POS of each input word as a template string, e.g. "NN".
    Currently only nouns ("N") are detected.
    '''
    pos_template = ""
    for lemma in lemmas:
        candidates = []
        for analysis in uralicApi.analyze(lemma, "fin"):
            pos = analysis[0].split('+')[1]
            # print(analysis)
            if pos == 'N':
                candidates.append(pos)
        pos_template += most_frequent(candidates)
    return pos_template
def add_two_syllables_in_front(verse):
    verse = remove_extra_material(verse)
    try:
        if len(verse.split()) > 2 and verse.split()[0][1] in vowels:
            # Build a prefix from the first two characters of the verse, a random
            # consonant, and the final letter of the first word's lemma.
            prefix_thing = (verse[:2]
                            + random.choice(['k', 't', 'p', 's'])
                            + uralicApi.analyze(verse.split(" ")[0], "fin")[0][0].split('+')[0][-1]
                            + " ")
            verse = (prefix_thing + verse).capitalize()
        else:
            return verse
    except Exception:
        pass
    return verse
class FinnishUralicNLPMorphologicalRealizer(LanguageSpecificMorphologicalRealizer):
    def __init__(self):
        super().__init__("fi")

        self.case_map: Dict[str, str] = {
            "ssa": "Ine",
            "ssä": "Ine",
            "inessive": "Ine",
            "genitive": "Gen",
        }

    def realize(self, slot: Slot) -> str:
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to Finnish".format(slot.value))

        case = self.case_map.get(case.lower(), case.capitalize())
        log.debug("Normalized case {} to {}".format(slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "fin")
        log.debug("Identified {} possible analyses".format(len(possible_analyses)))
        if len(possible_analyses) == 0:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute".format(slot.value)
            )
            return slot.value

        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(analysis, slot.value))

        # We only want to replace the last occurrence of "Nom", as otherwise all parts of
        # compound words, rather than only the last, get transformed. This is simply wrong
        # for, e.g., "tyvipari": a global replacement to genitive results in *"tyvenparin"
        # rather than "tyviparin". Unfortunately, Python lacks a replace() which starts from
        # the right, so we need to identify the correct instance of "Nom" with rfind() and
        # then manually fiddle with slices.
        nom_start_idx = analysis.rfind("Nom")
        analysis = analysis[:nom_start_idx] + case + analysis[nom_start_idx + len("Nom"):]
        log.debug("Modified analysis to {}".format(analysis))

        modified_value = uralicApi.generate(analysis, "fin")[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
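# Pure-string illustration of the rfind() trick above, no model needed; the
# compound analysis is illustrative of the format, where each compound part
# carries its own tags.
analysis = "tyvi+N+Sg+Nom#pari+N+Sg+Nom"
idx = analysis.rfind("Nom")
print(analysis[:idx] + "Gen" + analysis[idx + len("Nom"):])
# -> "tyvi+N+Sg+Nom#pari+N+Sg+Gen": only the last part is retagged, so
# generate() yields "tyviparin" rather than *"tyvenparin".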
def tokenize_and_lemmatize(words):
    '''
    Tokenize and lemmatize input. Returns a list of lemmas.
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    lemmas = []
    for word in tokenizer.tokenize(words):
        for analysis in uralicApi.analyze(word.lower(), "fin"):
            if analysis[0].split('+', 1)[1] == 'N+Sg+Nom':
                lemmas.append(analysis[0].split('+')[0])
    if len(lemmas) < 2:
        print("Try with some other nouns.")
        exit()
    return lemmas
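# Hypothetical usage of tokenize_and_lemmatize(), assuming the Finnish model
# is installed and nltk's RegexpTokenizer is imported; the exact result
# depends on the analyzer.
print(tokenize_and_lemmatize("kissa ja koira"))   # e.g. ['kissa', 'koira']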
def print_unknown_words(elan_file_path, transcription_tier="orthT", language="kpv"):
    session_name = Path(elan_file_path).stem
    elan_file = pympi.Elan.Eaf(file_path=elan_file_path)
    transcription_tiers = elan_file.get_tier_ids_for_linguistic_type(transcription_tier)
    missed_annotations = []
    for transcription_tier in transcription_tiers:
        annotation_values = elan_file.get_annotation_data_for_tier(transcription_tier)
        for annotation_value in annotation_values:
            text_content = annotation_value[2]
            text_content = re.sub("…", ".", text_content)  # It seems word_tokenize doesn't handle "…"
            text_content = re.sub(r"\[\[unclear\]\]", "", text_content)
            words = word_tokenize(text_content)
            for word in words:
                analysis = uralicApi.analyze(word, language)
                if not analysis:
                    missed_annotations.append(word)
    # Print unknown words with their counts, most frequent first.
    for count, elem in sorted(((missed_annotations.count(e), e) for e in set(missed_annotations)), reverse=True):
        print('%s (%d)' % (elem, count))
def test_analysis(self):
    result = uralicApi.analyze("äkkipikainen", "fin", force_local=True)
    self.assertEqual(result[0][0], 'äkkipikainen+A+Sg+Nom')
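# Tests and scripts like these assume the relevant models have been installed
# locally; with uralicNLP that is a one-time download per language, e.g.:
from uralicNLP import uralicApi
uralicApi.download("fin")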
from uralicNLP import uralicApi
from mikatools import *


def get_lemmas():
    csv = open_read("2020-05-15T18_02_07-export.csv")
    csv.readline()  # skip the header row
    lemmas = [x.split(",")[2] for x in csv]
    return lemmas


adjective_map = {}
ambiguous_adj_map = {}
for lemma in get_lemmas():
    morphs = uralicApi.analyze(lemma, "sms")
    lemmas = []
    for morph in morphs:
        morph = morph[0]
        if "+A+Attr" in morph:
            l = morph.split("+")[0]
            if l != lemma:
                lemmas.append(l)
    lemmas = list(set(lemmas))
    if len(lemmas) == 1:
        adjective_map[lemma] = lemmas[0]
    elif len(lemmas) > 1:
        ambiguous_adj_map[lemma] = lemmas
json_dump(adjective_map, "A_attr_to_A.json")
json_dump(ambiguous_adj_map, "A_attr_to_A_for_Jack.json")
else: return ("_") # Here we read the XML file tree = ET.parse(input_xml) root = tree.getroot() # Now we loop over each sentence for sentence in root.findall('p/sentence'): annotated_text = "\n" for line in sentence.text.splitlines(): if line: line_content = line.split("\t") # print(line_content) # Uncommenting this is useful in checking where the script goes analysis = uralicApi.analyze(line_content[1], "kpv") line_text = line_content[0] + "\t" + line_content[ 1] + "\t" + line_content[2] + "\t" + get_lonely_lemmas( analysis) + "\t" + get_agreed_tags(analysis) + "\n" annotated_text += line_text sentence.text = annotated_text # In the end we write the new XML file tree.write(output_xml, encoding="UTF-8")
def create_verb_probabilities(usr_input):
    '''
    Uses the first input noun to find verbs that are semantically similar.
    Outputs verb candidates and their probability distribution.
    '''
    lemmas = tokenize_and_lemmatize(usr_input)
    input_posses = get_pos_template(lemmas)
    # print("Input POSes: " + input_posses + "\n")
    # If both input words are nouns. Other alternatives are not implemented.
    if input_posses == 'NN':
        lemma_dict = {'subject': lemmas[0], 'object': lemmas[1]}
        verse = []
        # Loop through both lemmas and inflect them depending on their syntactic role
        for lemma in lemmas:
            # print_some_input_info(lemma)  # FOR DEBUGGING
            for analysis in uralicApi.analyze(lemma, "fin"):
                # Strip the lemma off the front of the analysis, keeping only the tags.
                ms_desc = analysis[0].lstrip(analysis[0].split('+')[0])
                # print("Analysis of the lemma: " + lemma + ms_desc + "\n")  # FOR DEBUGGING
                if ms_desc[1] == 'N':
                    if lemma == lemma_dict['subject']:
                        generated = uralicApi.generate(lemma + "+N+Sg+Nom", "fin")
                    if lemma == lemma_dict['object']:
                        generated = uralicApi.generate(lemma + "+N+Sg+Gen", "fin")
                    if len(generated) > 0:
                        verse.append(generated[0][0])
                    else:
                        print("Try with other words.")
            # If the lemma is the subject, choose a verb using its word relations.
            # There's probably a better alternative for this.
            if lemma == lemma_dict['subject']:
                word = semfi.get_word(lemma, "N", "fin")
                while True:
                    try:
                        relations = semfi.get_by_relation(word, "dobj", "fin", sort=True)
                        break
                    except Exception as e:
                        print("At least one of the input words was not recognized, "
                              "try with other words.\n\n" + str(e))
                        exit()
                verbs_and_probs = []
                for relation in relations:
                    try:
                        if relation['word2']['pos'] == 'V':
                            inflected_form = uralicApi.generate(
                                relation['word2']['word'] + "+V+Act+Ind+Prs+Sg3", "fin")[0][0]
                            first_syllable = finmeter.hyphenate(inflected_form).split("-")[0]
                            if (count_syllables(inflected_form) == 2
                                    and not finmeter.is_short_syllable(first_syllable)):
                                verbs_and_probs.append(
                                    (relation['word2']['word'], relation['word2']['frequency']))
                    except Exception:
                        pass
                # Sort the verbs by frequency (descending) and drop both the top 5 %
                # most frequent and the least frequent half.
                verbs_and_probs = sorted(verbs_and_probs, key=lambda x: x[-1], reverse=True)[
                    round((len(verbs_and_probs) / 100) * 5):round((len(verbs_and_probs) / 100) * 50)]
                if len(verbs_and_probs) == 0:
                    print("Try with other words.")
                    exit()
                else:
                    # Normalize the probabilities and choose the verb randomly
                    verb_candidates, probability_distribution = map(list, zip(*verbs_and_probs))
                    probability_distribution = np.array(
                        np.array(probability_distribution) / sum(probability_distribution))
        return verb_candidates, probability_distribution, lemmas, lemma_dict, verse
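# Quick sketch of the generation call used above, with an illustrative verb
# lemma; assumes the Finnish model is installed. The tag string is the same
# "+V+Act+Ind+Prs+Sg3" pattern as in create_verb_probabilities().
print(uralicApi.generate("juosta+V+Act+Ind+Prs+Sg3", "fin")[0][0])  # e.g. "juoksee"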
def analyze_token(self, token: SentenceToken, **kwargs) -> SentenceToken:
    morphologies = [a[0] for a in uralicApi.analyze(token.text, self.lang)]
    return token.with_morphologies(morphologies, 'uralic')