class EnglishUralicNLPMorphologicalRealizer(LanguageSpecificMorphologicalRealizer):
    """Inflect slot values using the UralicNLP ``"eng"`` analyser and generator.

    A slot is only touched when it carries a ``case`` attribute; the case is
    normalised through ``case_map`` and appended as a tag to the first
    morphological analysis of the slot value.
    """

    def __init__(self):
        # NOTE(review): this realizer is registered under "fi" although it analyses and
        # generates with the "eng" models. Looks like a copy-paste slip, but the value is
        # left untouched because callers may look the realizer up by it -- confirm.
        super().__init__("fi")

        # Maps lower-cased case names to the tag appended to the analysis.
        self.case_map: Dict[str, str] = {"genitive": "GEN"}

    def realize(self, slot: Slot) -> str:
        """Return ``slot.value`` inflected according to the slot's ``case`` attribute.

        The original value is returned unchanged when the slot has no ``case``
        attribute, when the value cannot be analysed, or when no surface form
        can be generated from the modified analysis.
        """
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to English".format(slot.value))

        # Unknown case names are passed through upper-cased.
        case = self.case_map.get(case.lower(), case.upper())
        log.debug("Normalized case {} to {}".format(slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "eng")
        log.debug("Identified {} possible analyses".format(len(possible_analyses)))
        if not possible_analyses:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute".format(slot.value)
            )
            return slot.value

        # The first element of the first result is used as the analysis string.
        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(analysis, slot.value))

        analysis = "{}+{}".format(analysis, case)
        log.debug("Modified analysis to {}".format(analysis))

        generated = uralicApi.generate(analysis, "eng")
        if not generated:
            # Indexing an empty result would raise IndexError; fall back to the raw value instead.
            log.warning(
                "No surface form could be generated for {}, leaving {} unrealized".format(analysis, slot.value)
            )
            return slot.value

        modified_value = generated[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
Ejemplo n.º 2
0
async def generate(ctx, arg):
    """Send the forms generated for ``arg`` by the "fin" model to ``ctx``.

    Each result's first element becomes one line of a single message; when
    nothing is generated a fixed notice is sent instead.
    """
    generated = uralicApi.generate(arg, "fin")
    if len(generated) == 0:
        await ctx.send('generation not possible')
        return

    forms = [entry[0] for entry in generated]
    await ctx.send('\n'.join(forms))
Ejemplo n.º 3
0
def create_first_verse(verb_candidates, probability_distribution, lemmas,
                       verse):
    ''' Picks a verb from the candidates and then creates and outputs the first verse of the stanza.

    :param verb_candidates: Verb lemmas to draw from
    :param probability_distribution: Draw probability for each candidate, passed to ``choice`` as ``p``
    :param lemmas: Not used by this function; kept so existing callers keep working
    :param verse: Sequence of already inflected words forming the verse so far
    :return: Whatever ``fix_syllables`` returns for the verse with the verb inserted
    '''

    draw = choice(verb_candidates, 1, p=probability_distribution)
    verb = uralicApi.generate(draw[0] + "+V+Act+Ind+Prs+Sg3", "fin")[0][0]
    verse = " ".join(verse)

    # Insert the verb right after the first word only. A plain str.replace() on the
    # first word would also rewrite every later occurrence of it, including
    # occurrences inside longer words (e.g. "kissa" inside "kissan").
    first_word, separator, remainder = verse.partition(" ")
    verse = first_word + " " + verb + " " + separator + remainder

    return fix_syllables(verse, False)
class FinnishUralicNLPMorphologicalRealizer(
        LanguageSpecificMorphologicalRealizer):
    """Inflect slot values using the UralicNLP ``"fin"`` analyser and generator.

    A slot is only touched when it carries a ``case`` attribute; the last
    ``Nom`` tag of the first morphological analysis is swapped for the
    requested case tag and the result is generated back into a surface form.
    """

    def __init__(self):
        super().__init__("fi")

        # Maps lower-cased case names (or bare suffixes) to the case tag put into the analysis.
        self.case_map: Dict[str, str] = {
            "ssa": "Ine",
            "ssä": "Ine",
            "inessive": "Ine",
            "genitive": "Gen"
        }

    def realize(self, slot: Slot) -> str:
        """Return ``slot.value`` inflected according to the slot's ``case`` attribute.

        The original value is returned unchanged when the slot has no ``case``
        attribute, when the value cannot be analysed, when the chosen analysis
        contains no ``Nom`` tag to replace, or when no surface form can be
        generated from the modified analysis.
        """
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to Finnish".format(slot.value))

        # Unknown case names are passed through capitalized.
        case = self.case_map.get(case.lower(), case.capitalize())
        log.debug("Normalized case {} to {}".format(
            slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "fin")
        log.debug("Identified {} possible analyses".format(
            len(possible_analyses)))
        if not possible_analyses:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute"
                .format(slot.value))
            return slot.value

        # The first element of the first result is used as the analysis string.
        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(
            analysis, slot.value))

        # We only want to replace the last occurrence of "Nom", as otherwise all parts of compound words, rather than
        # only the last, get transformed. This is simply wrong for, e.g. "tyvipari": a global replacement with the
        # genitive results in *"tyvenparin", rather than "tyviparin". Python lacks a replace() which starts from the
        # right, so the correct instance of "Nom" is located with rfind() and swapped out with slices.
        nominative_tag = "Nom"
        tag_start_idx = analysis.rfind(nominative_tag)
        if tag_start_idx == -1:
            # Slicing with -1 would silently mangle the analysis string.
            log.warning(
                "Analysis {} has no {} tag to replace, unable to realize {}"
                .format(analysis, nominative_tag, slot.value))
            return slot.value

        # Insert the requested case (not a hard-coded genitive) and keep everything that follows the tag intact.
        analysis = (analysis[:tag_start_idx] + case +
                    analysis[tag_start_idx + len(nominative_tag):])
        log.debug("Modified analysis to {}".format(analysis))

        generated = uralicApi.generate(analysis, "fin")
        if not generated:
            # Indexing an empty result would raise IndexError; fall back to the raw value instead.
            log.warning(
                "No surface form could be generated for {}, leaving {} unrealized"
                .format(analysis, slot.value))
            return slot.value

        modified_value = generated[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
Ejemplo n.º 5
0
 def test_generate(self):
     """Generating the singular genitive of "äkkipikainen" locally yields "äkkipikaisen"."""
     generated = uralicApi.generate(
         "äkkipikainen+A+Sg+Gen", "fin", force_local=True)
     first_form = generated[0][0]
     self.assertEqual(first_form, 'äkkipikaisen')
Ejemplo n.º 6
0
def dictionary_entry(grouped_relation):
    '''
    Render one source lexeme and all of its translations as LaTeX dictionary markup.

    The first output line is an ``\\entry`` macro for the source lexeme; it is
    followed by one ``\\translation`` macro per relation, carrying the
    translation text, selected inflected forms, specifications and examples.

    :param grouped_relation: Lexeme ID with all Relation objects linked to it
    :return: A latex string to represent the relation as an entry in the dictionary
    '''

    _lexeme_from_id, relations = grouped_relation
    relations = list(relations)
    lexeme_from = relations[0].lexeme_from

    dictionary_entry_text = []

    entry_content = (
        lexeme_from.lexeme,
        lexeme_from.pos,
        lexeme_from.specification,
    )
    entry_content = tuple(tex_escape(c) for c in entry_content)
    # The backslash is escaped explicitly: "\e" is not a valid escape sequence.
    dictionary_entry_text.append("\\entry{%s}{%s}{%s}" % entry_content)

    # Default inflection tags shown per part of speech.
    inflection_table = {
        'V': [
            'V+Ind+Prs+ConNeg', 'V+Ind+Prs+Sg3', 'V+Ind+Prt+Sg1',
            'V+Ind+Prt+Sg3'
        ],
        'N': ['N+Sg+Loc', 'N+Sg+Ill', 'N+Pl+Gen'],
        'A': ['A+Attr'],
        'Prop': [
            'N+Prop+Sg+Loc', 'N+Prop+Sem/Mal+Sg+Loc', 'N+Prop+Sem/Fem+Sg+Loc',
            'N+Prop+Sem/Plc+Sg+Loc'
        ]
    }

    # Overrides of the defaults, keyed by a regex matched against
    # "<contlex>,<inflexType>". The first matching pattern (in insertion
    # order) wins.
    contelex_inflexType_cases = {
        'V': {
            r'(YD,1)': ['V+Ind+Prs+Sg1', 'V+Ind+Prs+Sg3'],
            r'(AD,1)': ['V+Ind+Prs+Sg1', 'V+Ind+Prt+Sg1'],
            r'(ED,1)': ['V+Ind+Prs+Sg1', 'V+Ind+Prt+Sg1', 'V+Ind+Prt+Sg3'],
            r'(,[2-4])': ['V+Ind+Prs+ConNeg']
        },
        'N': {
            r'(Q[^,\n]*)(,1)': ['N+Sg+Loc', 'N+Sg+Ill', 'N+Pl+Gen'],
            r'(_[^Q,\n]*)(,1)': ['N+Sg+Loc', 'N+Sg+Ill'],
            # NOTE(review): 'N+Sg+IllN+Sg+Gen' looks like two tags fused by a
            # missing comma; left as-is because the intended list is unclear.
            r'(,3|[^D],2|ID,2)': ['N+Sg+Gen', 'N+Sg+IllN+Sg+Gen', 'N+Sg+Ill'],
            r'(,4|[YAE]D,2)': ['N+Sg+Loc', 'N+Sg+Ill'],
        }
    }

    # Tag suffix appended after the POS when generating the lemma form of a translation.
    translation_lemma_map = {
        'V': '+Inf',
        'N': '+N+Sg+Nom',
        'A': '+Sg+Nom',
        'Adv': ''
    }

    # Relations without metadata come first, then ordered by target language.
    relations = sorted(relations,
                       key=lambda r: (
                           r.relationmetadata_set.all().count() != 0,
                           r.lexeme_to.lexeme_lang,
                       ))
    for r in relations:
        translation = r.lexeme_to
        translation_text = translation.lexeme
        pos = '' if translation.pos == lexeme_from.pos else translation.pos
        if translation.pos in translation_lemma_map:
            # The homonym tag is optional; it must be resolved on its own so
            # that the conditional expression does not swallow the lexeme or
            # the POS part of the query.
            homonym_tag = ('Hom{}+'.format(translation.homoId)
                           if translation.homoId > 0 else '')
            result = uralicApi.generate(
                translation.lexeme + '+' + homonym_tag + translation.pos +
                translation_lemma_map[translation.pos],
                translation.language,
                dictionary_forms=True)
            if result:
                translation_text = result[0][0]

        # Collect the inflected forms to print for this translation.
        inflections = []
        MP_forms = translation.miniparadigm_set.all()
        existing_MP_forms = defaultdict(list)
        for form in MP_forms:
            existing_MP_forms[form.msd].append(form.wordform)

        if translation.id not in nti_ids:  # ignore certain translations
            # custom transducer
            generated_MP_forms = defaultdict(list)
            if synthetiser:
                try:
                    queries, _ = _inflector.__generator_queries__(
                        translation.lexeme, translation.pos)
                    for query in queries:
                        # The form key is the query without its leading lemma.
                        MP_form = '+'.join(query.split('+')[1:])
                        generated_MP_forms[MP_form].append(
                            synthetiser.lookup(query)[0][0].split("@")[0])
                except Exception:  # POS is empty or no queries
                    # Best effort: keep whatever was generated before the failure.
                    pass
            else:  # default (uralicNLP)
                generated_MP_forms = _inflector.generate_uralicNLP(
                    translation.language,
                    translation.lexeme,
                    translation.pos,
                    dictionary_forms=True)

            if translation.pos in inflection_table:
                inflection_forms = inflection_table[
                    translation.pos]  # default inflections

                # specific inflections based on contlex
                if translation.contlex and translation.pos in contelex_inflexType_cases:
                    contlex_key = "{},{}".format(translation.contlex,
                                                 translation.inflexType_str())
                    for re_pattern, _inflections in contelex_inflexType_cases[
                            translation.pos].items():
                        if re.search(re_pattern, contlex_key):
                            inflection_forms = _inflections
                            break

                for inflection_form in inflection_forms:
                    # Stored mini-paradigm forms take precedence over generated ones.
                    generated_form = None
                    if inflection_form in existing_MP_forms:
                        generated_form = existing_MP_forms[inflection_form]
                    elif inflection_form in generated_MP_forms:
                        generated_form = generated_MP_forms[inflection_form]

                    if generated_form:
                        if inflection_form == 'A+Attr':
                            generated_form = [
                                "#{}".format(gf) for gf in generated_form
                            ]
                        elif inflection_form == 'V+Ind+Prs+ConNeg':
                            # Work on a copy so the looked-up list is not modified in place.
                            generated_form = list(generated_form)
                            generated_form[0] = "ij {}".format(
                                generated_form[0])
                        inflections.extend(generated_form)

            # Capitalised nouns without any inflections are retried as proper
            # nouns; the first tag set that generates something is used.
            if not inflections and translation.pos == 'N' and re.match(
                    r'[A-Z](.+)', translation.lexeme):
                for inflection_form in inflection_table['Prop']:
                    generated_results = uralicApi.generate(
                        "{}+{}".format(translation.lexeme, inflection_form),
                        translation.language)
                    generated_form = [
                        gr[0].split('@')[0] for gr in generated_results
                    ]
                    if generated_form:
                        inflections.extend(generated_form)
                        break

        source_specification = r.relationmetadata_set.values_list('text', flat=True) \
            .filter(type=SPECIFICATION, language=lexeme_from.language) \
            .order_by('text').all()
        target_specification = r.relationmetadata_set.values_list('text', flat=True) \
            .filter(type=SPECIFICATION, language=translation.language) \
            .order_by('text').all()
        source_example = r.relationexample_set.values_list('text', flat=True) \
            .filter(language=lexeme_from.language).order_by('text').all()
        target_example = r.relationexample_set.values_list('text', flat=True) \
            .filter(language=translation.language).order_by('text').all()

        # LaTeX escape the content
        content = (translation_text, translation.specification, pos,
                   ", ".join(inflections), ", ".join(source_specification),
                   ", ".join(target_specification), ", ".join(source_example),
                   ", ".join(target_example), "")
        content = tuple(tex_escape(c) for c in content)
        dictionary_entry_text.append(
            "\\translation{%s}{%s}{%s}{%s}{%s}{%s}{%s}{%s}{%s}" % content)

    return "\n".join(dictionary_entry_text)
Ejemplo n.º 7
0
def create_verb_probabilities(usr_input):
    ''' Uses the first input noun to find verbs that are semantically similar. Outputs verb candidates and their probability distribution.

    :param usr_input: Raw user input, passed to ``tokenize_and_lemmatize``
    :return: Tuple ``(verb_candidates, probability_distribution, lemmas, lemma_dict, verse)``
    :raises ValueError: If the input is not a noun + noun pair (POS template ``'NN'``)

    The process is terminated via ``exit()`` when the subject's relations
    cannot be looked up or when no verb candidate survives the filtering.
    '''

    lemmas = tokenize_and_lemmatize(usr_input)
    input_posses = get_pos_template(lemmas)

    # Only the case where both input words are nouns is implemented. Fail
    # with a clear message instead of hitting unbound locals at the return.
    if input_posses != 'NN':
        raise ValueError(
            "Only noun + noun input is supported, got POS template {!r}".format(
                input_posses))

    lemma_dict = {'subject': lemmas[0], 'object': lemmas[1]}
    verse = []

    # Loop through both lemmas and inflect them depending on their syntactic role
    for lemma in lemmas:
        # Reset per lemma so a lemma without a noun analysis neither raises
        # NameError nor reuses the previous lemma's form.
        generated = []

        for analysis in uralicApi.analyze(lemma, "fin"):
            # Everything after the first '+' is the morphosyntactic description.
            _, _, tags = analysis[0].partition('+')
            if tags.startswith('N'):
                if lemma == lemma_dict['subject']:
                    generated = uralicApi.generate(lemma + "+N+Sg+Nom",
                                                   "fin")
                if lemma == lemma_dict['object']:
                    generated = uralicApi.generate(lemma + "+N+Sg+Gen",
                                                   "fin")

        if len(generated) > 0:
            verse.append(generated[0][0])
        else:
            print("Try with other words.")

        # If the lemma is subject, choose a verb using its word relations. There's probably a better alternative for this.
        if lemma == lemma_dict['subject']:
            word = semfi.get_word(lemma, "N", "fin")
            try:
                relations = semfi.get_by_relation(word,
                                                  "dobj",
                                                  "fin",
                                                  sort=True)
            except Exception as e:
                # str() is required: concatenating the exception object itself raises TypeError.
                print(
                    "At least one of the input words was not recognized, try with other words.\n\n"
                    + str(e))
                exit()

            verbs_and_probs = []
            for relation in relations:
                try:
                    if relation['word2']['pos'] == 'V':
                        inflected_form = uralicApi.generate(
                            relation['word2']['word'] +
                            "+V+Act+Ind+Prs+Sg3", "fin")[0][0]
                        first_syllable = finmeter.hyphenate(
                            inflected_form).split("-")[0]
                        # Keep two-syllable forms whose first syllable is not short.
                        if count_syllables(
                                inflected_form
                        ) == 2 and not finmeter.is_short_syllable(
                                first_syllable):
                            verbs_and_probs.append(
                                (relation['word2']['word'],
                                 relation['word2']['frequency']))
                except Exception:
                    # Best effort: relations that cannot be inflected or
                    # hyphenated are simply not candidates.
                    continue

            # Sort the verb by frequency (descending order) and get rid of the top 5% frequent and the half that is least frequent
            candidate_count = len(verbs_and_probs)
            lower_bound = round((candidate_count / 100) * 5)
            upper_bound = round((candidate_count / 100) * 50)
            verbs_and_probs = sorted(
                verbs_and_probs, key=lambda x: x[-1],
                reverse=True)[lower_bound:upper_bound]
            if len(verbs_and_probs) == 0:
                print("Try with other words.")
                exit()

            # Normalize the probabilities so they sum to one
            verb_candidates, probability_distribution = map(
                list, zip(*verbs_and_probs))
            probability_distribution = np.array(
                np.array(probability_distribution) /
                sum(probability_distribution))

    return verb_candidates, probability_distribution, lemmas, lemma_dict, verse
Ejemplo n.º 8
0
        else:
            pos = "A"

        if "CASE" not in args:
            args["CASE"] = "NOM"
        else:
            args["CASE"] = args["CASE"].upper()
        if "DEGREE" in args:
            degree = "+" + args["DEGREE"]
        possessive = ""
        if "POSS" in args:
            possessive = "+" + args["POSS"]
        #omorfi_query = "[WORD_ID="+word+"][POS="+pos+"][NUM="+args["NUM"]+"][CASE="+args["CASE"]+"]"
        omorfi_query = word + "+" + pos + degree + "+" + args["NUM"].title(
        ) + "+" + args["CASE"].title() + possessive + c**t
    word_form = _filter_generated(uralicApi.generate(omorfi_query, "fin"),
                                  word)
    if len(word_form) == 0:
        #Generation failed!
        if pos == "N":
            return inflect(beginning + "|" + word, "N+Prop", args)
        else:
            return beginning + backup_inflect(word, pos, args)
    else:
        return beginning + word_form[0][0]


def _filter_generated(res, lemma):
    if len(res) < 2:
        return res
    for r in res: