class EnglishUralicNLPMorphologicalRealizer(LanguageSpecificMorphologicalRealizer):
    def __init__(self):
        super().__init__("fi")

        self.case_map: Dict[str, str] = {"genitive": "GEN"}

    def realize(self, slot: Slot) -> str:
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to Finnish")

        case = self.case_map.get(case.lower(), case.upper())
        log.debug("Normalized case {} to {}".format(slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "eng")
        log.debug("Identified {} possible analyses".format(len(possible_analyses)))
        if len(possible_analyses) == 0:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute".format(slot.value)
            )
            return slot.value

        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(analysis, slot.value))

        analysis = "{}+{}".format(analysis, case)
        log.debug("Modified analysis to {}".format(analysis))

        modified_value = uralicApi.generate(analysis, "eng")[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
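For reference, the analyze, edit-the-tags, then generate round trip that the realizer above relies on can be sketched directly against uralicApi; this assumes the English models have been installed with uralicApi.download("eng"), and the word and case tag below are purely illustrative.

from uralicNLP import uralicApi

analyses = uralicApi.analyze("cat", "eng")       # list of (analysis, weight) pairs
if analyses:
    analysis = analyses[0][0] + "+GEN"           # append the normalized case tag
    forms = uralicApi.generate(analysis, "eng")  # regenerate a surface form
    if forms:
        print(forms[0][0])                       # exact output depends on the FST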
Example #2
def _filter_generated(res, lemma):
    if len(res) < 2:
        return res
    for r in res:
        r_as = uralicApi.analyze(r[0], "fin", dictionary_forms=True)
        for r_a in r_as:
            r_a = r_a[0]
            if "+Use/Arch" not in r_a and "+Dial/" not in r_a and r_a.startswith(
                    lemma):
                return [r]
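A hypothetical call site for _filter_generated: it is meant to take the raw output of uralicApi.generate() together with the lemma it was generated from, and it falls through (returns None) when nothing passes the archaic/dialectal filter, so the caller should check for that.

generated = uralicApi.generate("kissa+N+Sg+Gen", "fin")  # illustrative lemma and tags
filtered = _filter_generated(generated, "kissa")
if filtered:
    surface = filtered[0][0]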
Example #3
def _deriv_analysis(word, words):
	anas = uralicApi.analyze(word, "sms")
	deriv_lemmas = []
	for ana in anas:
		a = ana[0]
		if "+Der" in a:
			w = a.split("+")[0]
			if w != word:
				w_i = get_id(w, words)
				deriv_lemmas.append([w_i + "_" +w, a])
	return deriv_lemmas
Example #4
def list_not_in_fst():
	csv = open_read("/Users/mikahama/Downloads/2019-10-08T12_15_55-export.csv")
	csv.readline()
	r = []
	for l in csv:
		w = l.split(",")[2]
		a = uralicApi.analyze(w, "sms")
		if len(a) == 0:
			r.append(w)
	out = open_write("jackille_sms.txt")
	out.write("\n".join(r))
	out.close()
Example #5
def _pos_tag(words):
    pos_tags = {"A": [], "Adv": [], "V": [], "N": [], "UNK": []}
    accepted_tags = set(pos_tags.keys())
    for word in words:
        analysis = uralicApi.analyze(word[0], "fin", force_local=True)
        tag = "UNK"
        for analys in analysis:
            analys = _merge_compound_analysis(analys[0])
            if word[0] == analys[0] and analys[1] in accepted_tags:
                tag = analys[1]
                break
        pos_tags[tag].append(word)
    return pos_tags
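A hypothetical call: _pos_tag expects items whose first element is the surface word (for instance (word, count) pairs) and buckets each item under the POS of a matching dictionary-form analysis; _merge_compound_analysis is assumed to come from the same module as this snippet.

buckets = _pos_tag([("kissa", 12), ("nopeasti", 3)])
nouns = buckets["N"]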
Example #6
async def lemmatize(ctx, arg):
    response = []
    lines = uralicApi.analyze(arg, "fin")

    if len(lines) == 0:
        await ctx.send('word not found.')
        return

    for line in lines:
        response.append(line[0]) 

    response = ('\n'.join(response))
    await ctx.send(response)
Example #7
def analyze(word, lang):
    try:
        a = uralicApi.analyze(word, lang)
        a = map(lambda r: r[0].split('+'), a)
        a = list(filter(lambda r: r[0] == word, a))
        if not a:
            return [[None]]
        a = list(map(lambda r: r[1:], a))
        a = list(filter(lambda r: r, a))
        return a
    except:
        pass
    return [[None]]
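An illustrative call to the wrapper above; it only keeps analyses whose lemma equals the input word, so inflected forms come back as [[None]], and the exact tags depend on the installed transducer.

tags = analyze("kissa", "fin")
# e.g. [["N", "Sg", "Nom"], ...] for a dictionary form, or [[None]] on failure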
Example #8
def get_pos_template(lemmas):
    ''' Return the POSes of each input word, e.g. "NN". Currently only searches for nouns (N). '''

    pos_template = ""
    for lemma in lemmas:
        candidates = []
        for analysis in uralicApi.analyze(lemma, "fin"):
            pos = analysis[0].split('+')[1]
            # print(analysis)
            if pos == 'N':
                candidates.append(pos)
        pos_template += most_frequent(candidates)

    return pos_template
Example #9
def add_two_syllables_in_front(verse):
    verse = remove_extra_material(verse)
    try:
        if len(verse.split()) > 2 and verse.split()[0][1] in vowels:
            prefix_thing = verse[:2] + random.choice(
                ['k', 't', 'p', 's']) + uralicApi.analyze(
                    verse.split(" ")[0], "fin")[0][0].split('+')[0][-1] + " "
            verse = (prefix_thing + verse).capitalize()
        else:
            return verse
    except:
        pass

    return verse
Example #10
class FinnishUralicNLPMorphologicalRealizer(
        LanguageSpecificMorphologicalRealizer):
    def __init__(self):
        super().__init__("fi")

        self.case_map: Dict[str, str] = {
            "ssa": "Ine",
            "ssä": "Ine",
            "inessive": "Ine",
            "genitive": "Gen"
        }

    def realize(self, slot: Slot) -> str:
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to Finnish".format(slot.value))

        case = self.case_map.get(case.lower(), case.capitalize())
        log.debug("Normalized case {} to {}".format(
            slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "fin")
        log.debug("Identified {} possible analyses".format(
            len(possible_analyses)))
        if len(possible_analyses) == 0:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute"
                .format(slot.value))
            return slot.value

        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(
            analysis, slot.value))

        # We only want to replace the last occurrence of "Nom", as otherwise all parts of compound words, rather than
        # only the last, get transformed to genitive. This is simply wrong for, e.g. "tyvipari". Simply doing a global
        # replacement results in *"tyvenparin", rather than "tyviparin". Unfortunately, python lacks a replace() which
        # starts from the right, so we need to identify the correct instance of "Nom" with rfind() and then manually
        # fiddle with slices.
        gen_start_idx = analysis.rfind("Nom")
        analysis = analysis[:gen_start_idx] + "Gen" + analysis[
            gen_start_idx + 4:]  # 4 = 1 + len("Nom")
        log.debug("Modified analysis to {}".format(analysis))

        modified_value = uralicApi.generate(analysis, "fin")[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
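The right-most replacement described in the comment above can be sketched in isolation with a made-up compound analysis string; only the final "Nom" becomes "Gen", so earlier parts of the compound keep their form.

analysis = "tyvi+N+Sg+Nom#pari+N+Sg+Nom"  # illustrative analysis string
idx = analysis.rfind("Nom")
analysis = analysis[:idx] + "Gen" + analysis[idx + len("Nom"):]
# -> "tyvi+N+Sg+Nom#pari+N+Sg+Gen"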
Example #11
def tokenize_and_lemmatize(words):
    ''' Tokenize and lemmatize input. Returns a list of lemmas. '''

    tokenizer = RegexpTokenizer(r'\w+')
    lemmas = []

    for word in tokenizer.tokenize(words):
        for analysis in uralicApi.analyze(word.lower(), "fin"):
            if analysis[0].split('+', 1)[1] == 'N+Sg+Nom':
                lemmas.append(analysis[0].split('+')[0])

    if len(lemmas) < 2:
        print("Try with some other nouns.")
        exit()

    return lemmas
Example #12
def print_unknown_words(elan_file_path,
                        transcription_tier="orthT",
                        language="kpv"):

    session_name = Path(elan_file_path).stem

    elan_file = pympi.Elan.Eaf(file_path=elan_file_path)

    transcription_tiers = elan_file.get_tier_ids_for_linguistic_type(
        transcription_tier)

    missed_annotations = []

    for transcription_tier in transcription_tiers:

        annotation_values = elan_file.get_annotation_data_for_tier(
            transcription_tier)

        for annotation_value in annotation_values:

            text_content = annotation_value[2]
            text_content = re.sub(
                "…", ".",
                text_content)  # It seems word_tokenize doesn't handle "…"
            text_content = re.sub("\[\[unclear\]\]", "", text_content)

            words = word_tokenize(text_content)

            for word in words:

                analysis = uralicApi.analyze(word, language)
                if not analysis:
                    missed_annotations.append(word)

    for count, elem in sorted(
        ((missed_annotations.count(e), e) for e in set(missed_annotations)),
            reverse=True):
        print('%s (%d)' % (elem, count))
Example #13
 def test_analysis(self):
     result = uralicApi.analyze("äkkipikainen", "fin", force_local=True)
     self.assertEqual(result[0][0], 'äkkipikainen+A+Sg+Nom')
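force_local=True only works when the models are already on disk; a minimal setup sketch, assuming the Finnish models are fetched once with uralicApi.download:

from uralicNLP import uralicApi

uralicApi.download("fin")  # one-time download of the Finnish models used above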
Example #14
from uralicNLP import uralicApi
from mikatools import *


def get_lemmas():
    csv = open_read("2020-05-15T18_02_07-export.csv")
    csv.readline()
    lemmas = [x.split(",")[2] for x in csv]
    return lemmas


adjective_map = {}
ambiguous_adj_map = {}

for lemma in get_lemmas():
    morphs = uralicApi.analyze(lemma, "sms")
    lemmas = []
    for morph in morphs:
        morph = morph[0]
        if "+A+Attr" in morph:
            l = morph.split("+")[0]
            if l != lemma:
                lemmas.append(l)
    lemmas = list(set(lemmas))
    if len(lemmas) == 1:
        adjective_map[lemma] = lemmas[0]
    elif len(lemmas) > 1:
        ambiguous_adj_map[lemma] = lemmas

json_dump(adjective_map, "A_attr_to_A.json")
json_dump(ambiguous_adj_map, "A_attr_to_A_for_Jack.json")
Example #15
    else:
        return ("_")


# Here we read the XML file

tree = ET.parse(input_xml)
root = tree.getroot()

# Now we loop over each sentence

for sentence in root.findall('p/sentence'):

    annotated_text = "\n"

    for line in sentence.text.splitlines():

        if line:
            line_content = line.split("\t")
            # print(line_content) # Uncommenting this is useful in checking where the script goes
            analysis = uralicApi.analyze(line_content[1], "kpv")
            line_text = line_content[0] + "\t" + line_content[
                1] + "\t" + line_content[2] + "\t" + get_lonely_lemmas(
                    analysis) + "\t" + get_agreed_tags(analysis) + "\n"
            annotated_text += line_text
    sentence.text = annotated_text

# In the end we write the new XML file

tree.write(output_xml, encoding="UTF-8")
Example #16
def create_verb_probabilities(usr_input):
    ''' Uses the first input noun to find a verbs that are semantically similar. Outputs verb candidates and their probability distribution. '''

    lemmas = tokenize_and_lemmatize(usr_input)
    input_posses = get_pos_template(lemmas)
    # print("Input POSes: " + input_posses + "\n")

    # If both input words are noun. Other alternatives are not implemented.
    if input_posses == 'NN':
        lemma_dict = {'subject': lemmas[0], 'object': lemmas[1]}
        verse = []

        # Loop through both lemmas and inflect them depending on their syntactic role
        for lemma in lemmas:
            # print_some_input_info(lemma) # FOR DEBUGGING

            for analysis in uralicApi.analyze(lemma, "fin"):
                ms_desc = analysis[0].lstrip(analysis[0].split('+')[0])
                # print("Analysis of the lemma: " + lemma + ms_desc + "\n") # FOR DEBUGGING
                if ms_desc[1] == 'N':
                    if lemma == lemma_dict['subject']:
                        generated = uralicApi.generate(lemma + "+N+Sg+Nom",
                                                       "fin")
                    if lemma == lemma_dict['object']:
                        generated = uralicApi.generate(lemma + "+N+Sg+Gen",
                                                       "fin")

            if len(generated) > 0:
                verse.append(generated[0][0])
            else:
                print("Try with other words.")

            # If the lemma is subject, choose a verb using its word relations. There's probably a better alternative for this.
            if lemma == lemma_dict['subject']:
                word = semfi.get_word(lemma, "N", "fin")
                while True:
                    try:
                        relations = semfi.get_by_relation(word,
                                                          "dobj",
                                                          "fin",
                                                          sort=True)
                        break
                    except Exception as e:
                        print(
                            "At least one of the input words was not recognized, try with other words.\n\n"
                            + str(e))
                        exit()

                verbs_and_probs = []
                for relation in relations:
                    try:
                        if relation['word2']['pos'] == 'V':
                            inflected_form = uralicApi.generate(
                                relation['word2']['word'] +
                                "+V+Act+Ind+Prs+Sg3", "fin")[0][0]
                            first_syllable = finmeter.hyphenate(
                                inflected_form).split("-")[0]
                            if count_syllables(
                                    inflected_form
                            ) == 2 and not finmeter.is_short_syllable(
                                    first_syllable):
                                verbs_and_probs.append(
                                    (relation['word2']['word'],
                                     relation['word2']['frequency']))
                    except:
                        pass

                # Sort the verb by frequency (descending order) and get rid of the top 5% frequent and the half that is least frequent
                verbs_and_probs = sorted(
                    verbs_and_probs, key=lambda x: x[-1], reverse=True)[round((
                        (len(verbs_and_probs) / 100) *
                        5)):round(((len(verbs_and_probs) / 100) * 50))]
                if len(verbs_and_probs) == 0:
                    print("Try with other words.")
                    exit()

                else:
                    # Normalize the probabilities and choose the verb randomly
                    verb_candidates, probability_distribution = map(
                        list, zip(*verbs_and_probs))
                    probability_distribution = np.array(
                        np.array(probability_distribution) /
                        sum(probability_distribution))

        return verb_candidates, probability_distribution, lemmas, lemma_dict, verse
Example #17
 def analyze_token(self, token: SentenceToken, **kwargs) -> SentenceToken:
     morphologies = [a[0] for a in uralicApi.analyze(token.text, self.lang)]
     return token.with_morphologies(morphologies, 'uralic')
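For context, uralicApi.analyze returns (analysis, weight) pairs, so the comprehension above yields a plain list of analysis strings; a hedged illustration with a made-up token:

morphologies = [a[0] for a in uralicApi.analyze("kissan", "fin")]
# e.g. ["kissa+N+Sg+Gen", ...] depending on the installed transducer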