Exemple #1
0
    def __init__(
        self,
        result: types.Result,
        *,
        search_run: core.SearchRun,
        display_mode="community",
        animate_emoji=AnimateEmoji.default,
        show_emoji=ShowEmoji.default,
        dict_source=None,
    ):
        """
        Build the presentation view of a single search result.

        :param result: the search result to present; its wordform, lemma
            wordform, is_lemma flag and source_language_match are copied
            onto this object
        :param search_run: the search run this result belongs to
        :param display_mode: key used to choose the relabeller ("english",
            "linguistic" or "source_language")
        :param animate_emoji: emoji preference, forwarded to
            get_lexical_info
        :param show_emoji: emoji visibility preference, forwarded to
            get_lexical_info
        :param dict_source: dictionary sources, forwarded to
            get_lexical_info
        :raises Exception: when settings.MORPHODICT_TAG_STYLE is neither
            "Plus" nor "Bracket"
        """
        self._result = result
        self._search_run = search_run

        # Read the labels once and reuse them for every display mode.
        labels = read_labels()
        # NOTE(review): the default display_mode ("community") is not a key
        # of this dict, so DisplayMode.default is what gets stored in that
        # case -- confirm that it is a usable relabeller.
        self._relabeller = {
            "english": labels.english,
            "linguistic": labels.linguistic_long,
            "source_language": labels.source_language,
        }.get(display_mode, DisplayMode.default)
        self._animate_emoji = animate_emoji
        self._show_emoji = show_emoji

        self.wordform = result.wordform
        self.lemma_wordform = result.lemma_wordform
        self.is_lemma = result.is_lemma
        self.source_language_match = result.source_language_match
        self.dict_source = dict_source

        if settings.MORPHODICT_TAG_STYLE == "Plus":
            # The analysis unpacks into (head tags, middle item, tail tags);
            # the middle item is not needed here. A missing analysis yields
            # empty head and tail.
            (
                self.linguistic_breakdown_head,
                _,
                self.linguistic_breakdown_tail,
            ) = result.wordform.analysis or [[], None, []]
        elif settings.MORPHODICT_TAG_STYLE == "Bracket":
            # Arapaho has some head tags that the Plus-style FSTs put at the
            # tail. For now, move them; later on elaboration could be a
            # language-specific function.
            head, _, tail = result.wordform.analysis or [[], None, []]

            new_head = []
            new_tail_prefix = []
            for tag in head:
                # Bracketed head tags are moved to the front of the tail.
                if tag.startswith("["):
                    new_tail_prefix.append(tag)
                else:
                    new_head.append(tag)
            self.linguistic_breakdown_head = new_head
            self.linguistic_breakdown_tail = new_tail_prefix + list(tail)
        else:
            raise Exception(f"Unknown {settings.MORPHODICT_TAG_STYLE=}")

        # NOTE(review): four positional arguments are passed here; confirm
        # this matches the get_lexical_info signature used by this revision.
        self.lexical_info = get_lexical_info(result.wordform.analysis,
                                             animate_emoji, self._show_emoji,
                                             self.dict_source)

        # self.morphemes is only assigned when the wordform has an analysis.
        if rich_analysis := result.wordform.analysis:
            self.morphemes = rich_analysis.generate_with_morphemes(
                result.wordform.text)
Exemple #2
0
def relabel_linguistic_long(pos: str):
    """
    Look up the long linguistic label for a word class.

    For example, "VTA-1" is expected to give:
    transitive animate verb – class 1: regular
    """
    long_labels = read_labels().linguistic_long
    return long_labels.get(pos)
Exemple #3
0
def relabel_linguistic_short(pos: str):
    """
    Look up the short linguistic label for a word class.

    For example, "VTA-1" is expected to give:
    VTA-1
    """
    short_labels = read_labels().linguistic_short
    return short_labels.get(pos)
Exemple #4
0
def relabel_source(pos: str):
    """
    Look up the source-language label for a word class.

    For example, "VTA-1" is expected to give:
    tâpiskôc: wîcihêw
    """
    source_labels = read_labels().source_language
    return source_labels.get(pos)
Exemple #5
0
def relabel_plain_english(pos: str):
    """
    Look up the plain English label for a word class.

    For example, "VTA-1" is expected to give:
    like: wîcihêw
    """
    english_labels = read_labels().english
    return english_labels.get(pos)
def get_emoji_for_cree_wordclass(
        word_class: Optional[str],
        animate_emoji: str = AnimateEmoji.default) -> Optional[str]:
    """
    Attempts to get an emoji description of the full wordclass.
    e.g., "👤👵🏽" for "nôhkom"

    :param word_class: the word class to describe, or None
    :param animate_emoji: preference passed on to
        use_preferred_animate_emoji
    :return: None when word_class is None or empty; otherwise the result of
        the emoji label lookup, passed through use_preferred_animate_emoji
        when the lookup found something
    """
    # An empty string has no first character to inspect, so treat it like a
    # missing word class instead of raising IndexError.
    if not word_class:
        return None

    def to_fst_output_style(value):
        """Turn a word class string into a list of FST-style tags."""
        if value.startswith("N"):
            # Every character becomes its own upper-cased tag.
            return list(value.upper())
        if value.startswith("V"):
            # "V" followed by the upper-cased remainder as one tag.
            return ["V", value[1:].upper()]
        return [value.title()]

    tags = to_fst_output_style(word_class)
    original = read_labels().emoji.get_longest(tags)

    # Pass a falsy lookup result through untouched.
    if not original:
        return original
    return use_preferred_animate_emoji(original, animate_emoji)
Exemple #7
0
def label_setting_to_relabeller(label_setting: str):
    """
    Map a label setting name to the matching relabeller.

    Recognised settings are "english", "linguistic" (short linguistic
    labels) and "source_language"; anything else falls back to the English
    relabeller.
    """
    labels = read_labels()

    relabellers = {
        "english": labels.english,
        "linguistic": labels.linguistic_short,
        "source_language": labels.source_language,
    }
    fallback = labels.english
    return relabellers.get(label_setting, fallback)
def serialize_wordform(wordform: Wordform, animate_emoji: str,
                       dict_source: list) -> SerializedWordform:
    """
    Intended to be passed in a JSON API or into templates.

    :param wordform: the wordform to serialize
    :param animate_emoji: emoji preference passed to
        get_emoji_for_cree_wordclass
    :param dict_source: passed to serialize_definitions as ``dict_source``
    :return: json parsable result
    """
    # Start from the dict produced by model_to_dict, then add the serialized
    # definitions and the wordform's URL.
    result = model_to_dict(wordform)
    result["definitions"] = serialize_definitions(wordform.definitions.all(),
                                                  dict_source=dict_source)
    result["lemma_url"] = wordform.get_absolute_url()

    # The extra fields below are only added when linguist_info is present.
    if wordform.linguist_info:
        # Add both the plain English and the long linguistic label for the
        # inflectional category, when there is one.
        if inflectional_category := wordform.linguist_info.get(
                "inflectional_category", None):
            result.update({
                "inflectional_category_plain_english":
                read_labels().english.get(inflectional_category),
                "inflectional_category_linguistic":
                read_labels().linguistic_long.get(inflectional_category),
            })
        # Add an emoji for the word class, when there is one.
        if wordclass := wordform.linguist_info.get("wordclass"):
            result["wordclass_emoji"] = get_emoji_for_cree_wordclass(
                wordclass, animate_emoji)
    # NOTE(review): no return statement is visible here even though the
    # signature and docstring promise a result -- confirm that the function
    # has not been truncated.
    def __init__(
        self,
        result: types.Result,
        *,
        search_run: core.SearchRun,
        display_mode="community",
        animate_emoji=AnimateEmoji.default,
        dict_source=None,
    ):
        """
        Build the presentation view of a single search result.

        :param result: the search result to present; its wordform, lemma
            wordform, is_lemma flag and source_language_match are copied
            onto this object
        :param search_run: the search run this result belongs to
        :param display_mode: key used to choose the relabeller ("community"
            or "linguistic")
        :param animate_emoji: emoji preference, forwarded to
            get_lexical_info
        :param dict_source: dictionary sources, forwarded to
            get_lexical_info
        :raises Exception: when settings.MORPHODICT_TAG_STYLE is neither
            "Plus" nor "Bracket"
        """
        self._result = result
        self._search_run = search_run

        # Read the labels once and reuse them for every display mode.
        labels = read_labels()
        # NOTE(review): an unrecognised display_mode stores
        # DisplayMode.default here -- confirm that it is a usable relabeller.
        self._relabeller = {
            "community": labels.english,
            "linguistic": labels.linguistic_long,
        }.get(display_mode, DisplayMode.default)
        self._animate_emoji = animate_emoji

        self.wordform = result.wordform
        self.lemma_wordform = result.lemma_wordform
        self.is_lemma = result.is_lemma
        self.source_language_match = result.source_language_match
        self.dict_source = dict_source

        if settings.MORPHODICT_TAG_STYLE == "Plus":
            # The analysis unpacks into (head tags, middle item, tail tags);
            # the middle item is not needed here. A missing analysis yields
            # empty head and tail.
            (
                self.linguistic_breakdown_head,
                _,
                self.linguistic_breakdown_tail,
            ) = result.wordform.analysis or [[], None, []]
        elif settings.MORPHODICT_TAG_STYLE == "Bracket":
            # Arapaho has some head tags that the Plus-style FSTs put at the
            # tail. For now, move them; later on elaboration could be a
            # language-specific function.
            head, _, tail = result.wordform.analysis or [[], None, []]

            new_head = []
            new_tail_prefix = []
            for tag in head:
                # Bracketed head tags are moved to the front of the tail.
                if tag.startswith("["):
                    new_tail_prefix.append(tag)
                else:
                    new_head.append(tag)
            self.linguistic_breakdown_head = new_head
            self.linguistic_breakdown_tail = new_tail_prefix + list(tail)
        else:
            raise Exception(f"Unknown {settings.MORPHODICT_TAG_STYLE=}")

        self.lexical_info = get_lexical_info(result.wordform.analysis,
                                             animate_emoji, self.dict_source)

        # Pull the preverb and reduplication entries out of the lexical info
        # so each kind can be reached directly.
        self.preverbs = [
            lexical_entry["entry"] for lexical_entry in self.lexical_info
            if lexical_entry["type"] == "Preverb"
        ]
        self.reduplication = [
            lexical_entry["entry"] for lexical_entry in self.lexical_info
            if lexical_entry["type"] == "Reduplication"
        ]

        # Relabelled versions of the head and tail breakdowns.
        self.friendly_linguistic_breakdown_head = replace_user_friendly_tags(
            to_list_of_fst_tags(self.linguistic_breakdown_head))
        self.friendly_linguistic_breakdown_tail = replace_user_friendly_tags(
            to_list_of_fst_tags(self.linguistic_breakdown_tail))
def get_lexical_info(result_analysis: RichAnalysis, animate_emoji: str,
                     dict_source: list) -> List[Dict]:
    """
    Collect a serialized lexical entry for each recognised prefix tag.

    Reduplication tags ("RdplW+", "RdplS+"), the initial change tag ("IC+")
    and preverb tags (those starting with "PV/") each contribute one entry;
    every other tag is skipped.

    :param result_analysis: the analysis whose prefix tags are inspected
    :param animate_emoji: passed to serialize_wordform for preverb entries
    :param dict_source: passed to serialize_wordform for preverb entries
    :return: the serialized entries in tag order, or an empty list when
        result_analysis is falsy
    """
    if not result_analysis:
        return []

    prefix_tags = result_analysis.prefix_tags
    first_letters = extract_first_letters(result_analysis)

    collected: List[Dict] = []

    for position, tag in enumerate(prefix_tags):
        entry = None
        entry_type = None

        if tag in ("RdplW+", "RdplS+"):
            reduplicated_text = generate_reduplication_string(
                tag, first_letters[position + 1])
            if reduplicated_text is not None:
                if tag == "RdplS+":
                    description = "Strong reduplication: intermittent, repeatedly, iteratively; again and again; here and there"
                else:
                    description = "Weak Reduplication: ongoing, continuing"
                entry = _ReduplicationResult(
                    text=reduplicated_text,
                    definitions=[{"text": description}],
                ).serialize()
                entry_type = "Reduplication"

        elif tag == "IC+":
            change_types = get_initial_change_types()
            entry_type = "Initial Change"
            entry = _InitialChangeResult(text=" ",
                                         definitions=change_types).serialize()

        elif tag.startswith("PV/"):
            # use altlabel.tsv to figure out the preverb

            # The short linguistic label looks like: "Preverb: âpihci-"
            short_label = read_labels().linguistic_short.get(
                cast(FSTTag, tag.rstrip("+")))
            if short_label:
                # Drop the "Preverb: " prefix to get the preverb text.
                preverb_text = short_label[len("Preverb: "):]
                candidates = Wordform.objects.filter(
                    text=preverb_text, raw_analysis__isnull=True)

                def distance_to_preverb(candidate):
                    # Compare against the candidate text without hyphens at
                    # either end.
                    return get_modified_distance(
                        preverb_text,
                        candidate.text.strip("-"),
                    )

                if candidates:
                    # Keep the candidate that looks the most similar.
                    preverb = min(candidates, key=distance_to_preverb)
                else:
                    # Can't find a match for the preverb in the database.
                    # This happens when searching against the test database
                    # for ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test
                    # database lacks ê and kî.
                    preverb = Wordform(text=preverb_text, is_lemma=True)

                entry = serialize_wordform(preverb, animate_emoji,
                                           dict_source)
                entry_type = "Preverb"

        if entry and entry_type:
            lexical_entry = _LexicalEntry(entry=entry,
                                          type=entry_type,
                                          original_tag=tag)
            collected.append(serialize_lexical_entry(lexical_entry))

    return collected
def replace_user_friendly_tags(fst_tags: List[FSTTag]) -> List[Label]:
    """Relabel the given FST tags using the English label set."""
    english_labels = read_labels().english
    return english_labels.get_full_relabelling(fst_tags)