Example 1
def build_result(
    wordform_length=0,
    target_language_keyword_match_len=0,
    **result_kwargs,
):
    wf = Wordform(text="f" * wordform_length, is_lemma=True)
    wf.lemma = wf

    result_kwargs.setdefault("target_language_keyword_match",
                             ["x"] * target_language_keyword_match_len)
    result = Result(wf, **result_kwargs)
    return result
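
A hedged usage sketch of the factory above; the extra kwarg in the second call
is forwarded untouched to Result (source_language_match is a Result kwarg that
appears later in this document):

# wordform text is "ffff"; target_language_keyword_match gets two entries
res = build_result(wordform_length=4, target_language_keyword_match_len=2)
# anything not named in the signature passes straight through to Result
res2 = build_result(wordform_length=4, source_language_match="ffff")
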
def test_espt_search_doesnt_crash_when_no_analysis(db):
    search_run = SearchRun("my little bears")
    espt_search = EsptSearch(search_run)
    espt_search.analyze_query()

    wordform = Wordform(text="pê-")
    wordform.lemma = wordform
    wordform.is_lemma = True
    search_run.add_result(
        Result(wordform=wordform, target_language_keyword_match=["bear"]))

    # This will crash if the espt code doesn’t handle results without an analysis
    espt_search.inflect_search_results()
    def populate_wordform_definitions(self, wf, senses):
        should_do_translation = self.translate_wordforms

        if should_do_translation:
            has_analysis_and_paradigm = (
                (wf.analysis and wf.paradigm)
                if not settings.MORPHODICT_ENABLE_FST_LEMMA_SUPPORT else
                (wf.fst_lemma and wf.paradigm))

            if not has_analysis_and_paradigm:
                should_do_translation = False

        definitions_and_sources = self.create_definitions(wf, senses)

        if not should_do_translation:
            return

        lemma_text = (wf.text
                      if not settings.MORPHODICT_ENABLE_FST_LEMMA_SUPPORT else
                      wf.fst_lemma)

        for (
                prefix_tags,
                suffix_tags,
        ) in self.paradigm_manager.all_analysis_template_tags(wf.paradigm):
            analysis = RichAnalysis((prefix_tags, lemma_text, suffix_tags))
            for generated in strict_generator().lookup(analysis.smushed()):
                # Skip re-instantiating lemma
                if analysis == wf.analysis:
                    continue

                inflected_wordform = Wordform(
                    # For now, leaving paradigm and linguist_info empty;
                    # code can get that info from the lemma instead.
                    text=generated,
                    raw_analysis=analysis.tuple,
                    lemma=wf,
                    is_lemma=False,
                )

                for d, sources in definitions_and_sources:

                    translation = translate_single_definition(
                        inflected_wordform, d.text, self.translation_stats)
                    if translation is None:
                        continue

                    is_inflected_wordform_unsaved = inflected_wordform.id is None
                    if is_inflected_wordform_unsaved:
                        self.wordform_buffer.add(inflected_wordform)

                    self._add_definition(
                        inflected_wordform,
                        translation,
                        ("🤖" + source for source in sources),
                        auto_translation_source=d,
                    )
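
The "🤖" prefix above marks citations of machine-translated definitions. A tiny
self-contained illustration of the generator expression passed to
_add_definition (the source names here are purely illustrative):

sources = ["CW", "MD"]
assert list("🤖" + source for source in sources) == ["🤖CW", "🤖MD"]
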
Example 4
    def inflect_search_results(self):
        if not self.query_analyzed_ok:
            return

        inflected_results = self._generate_inflected_results()

        # aggregating queries for performance
        possible_wordforms = Wordform.objects.filter(
            text__in=[r.inflected_text for r in inflected_results])
        wordform_lookup = {}
        for wf in possible_wordforms:
            wordform_lookup[(wf.text, wf.lemma_id)] = wf

        for result in inflected_results:
            wordform = wordform_lookup.get(
                (result.inflected_text,
                 result.original_result.lemma_wordform.id))
            if wordform is None:
                # inflected form not found in DB, so create a synthetic one. Can
                # happen for Plains Cree, when the EIP search produces a valid
                # analysis not covered by any paradigm file.
                #
                # Note: would not have auto-translations since those are
                # currently only available for wordforms that were previously
                # saved in the DB.
                lemma = result.original_result.lemma_wordform

                wordform = Wordform(
                    text=result.inflected_text,
                    lemma=lemma,
                    raw_analysis=result.analysis.tuple,
                )

            # if there are multiple inflections for the same original result, we
            # may already have removed it
            if self.search_run.has_result(result.original_result):
                self.search_run.remove_result(result.original_result)

            self.search_run.add_result(
                result.original_result.create_related_result(
                    wordform,
                    is_espt_result=True,
                ))
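
inflect_search_results above batches a single ORM query, then indexes the rows
by a (text, lemma_id) composite key. A framework-free sketch of that pattern,
with a hypothetical Row dataclass standing in for Wordform:

from dataclasses import dataclass

@dataclass
class Row:
    text: str
    lemma_id: int

def index_rows(rows):
    # One pass builds an O(1) lookup keyed on (text, lemma_id), so the loop
    # over inflected results never issues a per-result database query.
    return {(row.text, row.lemma_id): row for row in rows}

rows = [Row("nipâw", 1), Row("nipâw", 2)]
assert index_rows(rows)[("nipâw", 2)] is rows[1]
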
def serialize_wordform(wordform: Wordform, animate_emoji: str,
                       dict_source: list) -> SerializedWordform:
    """
    Intended to be passed in a JSON API or into templates.

    :return: json parsable result
    """
    result = model_to_dict(wordform)
    result["definitions"] = serialize_definitions(wordform.definitions.all(),
                                                  dict_source=dict_source)
    result["lemma_url"] = wordform.get_absolute_url()

    if wordform.linguist_info:
        if inflectional_category := wordform.linguist_info.get(
                "inflectional_category", None):
            result.update({
                "inflectional_category_plain_english":
                read_labels().english.get(inflectional_category),
                "inflectional_category_linguistic":
                read_labels().linguistic_long.get(inflectional_category),
            })
        if wordclass := wordform.linguist_info.get("wordclass"):
            result["wordclass_emoji"] = get_emoji_for_cree_wordclass(
                wordclass, animate_emoji)
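
serialize_wordform enriches a base dict only when the optional linguist_info
data is present. A minimal stand-alone sketch of that walrus-guarded pattern;
the values and placeholder labels below are illustrative, not real label-table
lookups:

result = {"text": "atim"}
linguist_info = {"inflectional_category": "NA-1", "wordclass": "NA"}

if inflectional_category := linguist_info.get("inflectional_category"):
    result["inflectional_category_plain_english"] = f"<label for {inflectional_category}>"
if wordclass := linguist_info.get("wordclass"):
    result["wordclass_emoji"] = f"<emoji for {wordclass}>"
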
Example 6
def fetch_results(search_run: core.SearchRun):
    fetch_results_from_target_language_keywords(search_run)
    fetch_results_from_source_language_keywords(search_run)

    # Use the spelling relaxation to try to decipher the query
    #   e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
    #         thus, we can match "acâhkos" in the dictionary!
    fst_analyses = set(rich_analyze_relaxed(search_run.internal_query))
    logger.debug("relaxed analyses: %s", [a.tuple for a in fst_analyses])

    db_matches = list(
        Wordform.objects.filter(
            raw_analysis__in=[a.tuple for a in fst_analyses]))

    for wf in db_matches:
        search_run.add_result(
            Result(
                wf,
                source_language_match=wf.text,
                query_wordform_edit_distance=get_modified_distance(
                    wf.text, search_run.internal_query),
            ))

        # An exact match here means we’re done with this analysis.
        fst_analyses.discard(wf.analysis)

    # fst_analyses has now been thinned by the calls to `fst_analyses.discard()`
    # above; the remaining items are analyses which are not in the database,
    # although their lemmas should be.
    for analysis in fst_analyses:
        # When the user query is outside of the paradigm tables,
        # e.g., preverb(s) plus reduplication: ê-mâh-misi-nâh-nôcihikocik
        # e.g., initial change: nêpât: {'IC+nipâw+V+AI+Cnj+3Sg'}

        normatized_form_for_analysis = strict_generator().lookup(
            analysis.smushed())
        if len(normatized_form_for_analysis) == 0:
            logger.error(
                "Cannot generate normative form for analysis: %s (query: %s)",
                analysis,
                search_run.internal_query,
            )
            continue

        # If there are multiple forms for this analysis, use the one that is
        # closest to what the user typed.
        normatized_user_query = min(
            normatized_form_for_analysis,
            key=lambda f: get_modified_distance(f, search_run.internal_query),
        )

        possible_lemma_wordforms = best_lemma_matches(
            analysis,
            Wordform.objects.filter(text=analysis.lemma, is_lemma=True))

        for lemma_wordform in possible_lemma_wordforms:
            synthetic_wordform = Wordform(
                text=normatized_user_query,
                raw_analysis=analysis.tuple,
                lemma=lemma_wordform,
            )
            search_run.add_result(
                Result(
                    synthetic_wordform,
                    analyzable_inflection_match=True,
                    query_wordform_edit_distance=get_modified_distance(
                        search_run.internal_query,
                        normatized_user_query,
                    ),
                ))
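
The min(...) call above picks the generated form closest to what the user
typed. A self-contained sketch of that selection, with difflib standing in for
get_modified_distance (project code that is not reproduced here):

import difflib

def pseudo_distance(a: str, b: str) -> float:
    # Stand-in metric: 0.0 for identical strings, approaching 1.0 as
    # the strings diverge.
    return 1.0 - difflib.SequenceMatcher(None, a, b).ratio()

candidates = ["acâhkos", "atâhkos", "ayâhkos"]
closest = min(candidates, key=lambda f: pseudo_distance(f, "atchakosuk"))
print(closest)
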
    def location(self, item: Wordform):
        return item.get_absolute_url(ambiguity="allow")
class Import:
    def __init__(
        self,
        importjson: list,
        translate_wordforms: bool,
        purge: bool,
        incremental: bool,
        atomic=True,
        skip_building_vectors_because_testing=False,
    ):
        """
        Create an Import process.

        If atomic is False, this will use batch processing that still works when
        not in a transaction.
        """
        self.dictionary_source_cache = DictionarySourceCache()
        self.data = importjson
        self.translate_wordforms = translate_wordforms
        self.incremental = incremental
        self.purge = purge
        self.skip_building_vectors_because_testing = (
            skip_building_vectors_because_testing)

        self._has_run = False

        self.paradigm_manager = default_paradigm_manager()
        self.translation_stats = TranslationStats()

        trigger_deps = not atomic

        self.wordform_buffer = InsertBuffer(Wordform.objects, assign_id=True)
        self.definition_buffer = InsertBuffer(
            Definition.objects,
            assign_id=True,
            trigger_deps=trigger_deps,
            deps=[self.wordform_buffer],
        )
        self.citation_buffer = InsertBuffer(
            Definition.citations.through.objects,
            trigger_deps=trigger_deps,
            deps=[self.definition_buffer],
        )
        self.source_language_keyword_buffer = InsertBuffer(
            SourceLanguageKeyword.objects,
            trigger_deps=trigger_deps,
            deps=[self.wordform_buffer],
        )
        self.target_language_keyword_buffer = InsertBuffer(
            TargetLanguageKeyword.objects,
            trigger_deps=trigger_deps,
            deps=[self.wordform_buffer],
        )

    def run(self):
        """Run the import process.

        This is the only method that external code should call.
        """
        if self._has_run:
            raise Exception("run can only be called once")
        self._has_run = True

        freshness_check = FreshnessCheck(self.data)

        seen_slugs = set()
        if self.purge:
            existing_slugs = self.gather_slugs()

        form_definitions = []

        for entry in tqdm(self.data, smoothing=0):
            if "formOf" in entry:
                form_definitions.append(entry)
                continue

            if not entry.get("senses"):
                raise Exception(f'Error: no senses for slug {entry["slug"]}')
            for sense in entry["senses"]:
                if "definition" not in sense:
                    raise Exception(
                        f'Error: no "definition" in sense {sense!r} of slug {entry["slug"]}'
                    )

            seen_slugs.add(validate_slug_format(entry["slug"]))

            if self.incremental and freshness_check.is_fresh(entry["slug"]):
                continue

            if existing := Wordform.objects.filter(slug=entry["slug"]).first():
                # Cascade should take care of all related objects.
                existing.delete()

            fst_lemma = None
            if "fstLemma" in entry:
                fst_lemma = entry["fstLemma"]
            elif (analysis := entry.get("analysis")) is not None:
                fst_lemma = analysis[1]

            wf = Wordform(
                text=entry["head"],
                raw_analysis=entry.get("analysis", None),
                fst_lemma=fst_lemma,
                paradigm=entry.get("paradigm", None),
                slug=entry["slug"],
                is_lemma=True,
                linguist_info=entry.get("linguistInfo", {}),
                import_hash=freshness_check.importjson_hash_for_slug(
                    entry["slug"]),
            )
            self.wordform_buffer.add(wf)
            assert wf.id is not None
            wf.lemma_id = wf.id

            if "senses" not in entry:
                raise Exception(
                    f"Invalid importjson: no senses for lemma text={wf.text} slug={wf.slug}"
                )

            self.populate_wordform_definitions(wf, entry["senses"])

            # Avoid dupes for this wordform
            seen_source_language_keywords: set[str] = set()

            slug_base = wf.slug.split("@")[0]
            if wf.text != slug_base and slug_base:
                self.add_source_language_keyword(
                    wf, slug_base, seen_source_language_keywords)
            if wf.fst_lemma and wf.text != wf.fst_lemma:
                self.add_source_language_keyword(
                    wf, wf.fst_lemma, seen_source_language_keywords)
            if wf.raw_analysis is None:
                self.index_unanalyzed_form(wf, seen_source_language_keywords)
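
Import.__init__ above wires the InsertBuffers into a dependency graph:
definition rows can only be written once their wordform rows exist, so a child
buffer flushes its parents first. InsertBuffer is project code; the toy sketch
below only illustrates that flush-parents-first idea:

class MiniBuffer:
    def __init__(self, deps=()):
        self.deps = list(deps)
        self.items = []

    def add(self, item):
        self.items.append(item)

    def flush(self):
        for dep in self.deps:
            dep.flush()  # parent rows (foreign-key targets) go in first
        # a real buffer would bulk-insert self.items here
        self.items.clear()

wordforms = MiniBuffer()
definitions = MiniBuffer(deps=[wordforms])
definitions.flush()  # flushes wordforms before definitions
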
def make_wf(text: str = "foo"):
    ret = Wordform(text=text, is_lemma=True)
    ret.lemma = ret
    return ret
def get_lexical_info(result_analysis: RichAnalysis, animate_emoji: str,
                     dict_source: list) -> List[Dict]:
    if not result_analysis:
        return []

    result_analysis_tags = result_analysis.prefix_tags
    first_letters = extract_first_letters(result_analysis)

    lexical_info: List[Dict] = []

    for (i, tag) in enumerate(result_analysis_tags):
        preverb_result: Optional[Preverb] = None
        reduplication_string: Optional[str] = None
        _type: Optional[LexicalEntryType] = None
        entry: Optional[_ReduplicationResult | SerializedWordform
                        | _InitialChangeResult] = None

        if tag in ["RdplW+", "RdplS+"]:
            reduplication_string = generate_reduplication_string(
                tag, first_letters[i + 1])

        elif tag == "IC+":
            change_types = get_initial_change_types()
            _type = "Initial Change"
            entry = _InitialChangeResult(text=" ",
                                         definitions=change_types).serialize()

        elif tag.startswith("PV/"):
            # use altlabel.tsv to figure out the preverb

            # ling_short looks like: "Preverb: âpihci-"
            ling_short = read_labels().linguistic_short.get(
                cast(FSTTag, tag.rstrip("+")))
            if ling_short:
                # convert to "âpihci-" by dropping the "Preverb: " prefix
                normative_preverb_text = ling_short[len("Preverb: "):]
                preverb_results = Wordform.objects.filter(
                    text=normative_preverb_text, raw_analysis__isnull=True)

                # find the one that looks the most similar
                if preverb_results:
                    preverb_result = min(
                        preverb_results,
                        key=lambda pr: get_modified_distance(
                            normative_preverb_text,
                            pr.text.strip("-"),
                        ),
                    )

                else:
                    # Can't find a match for the preverb in the database.
                    # This happens when searching against the test database for
                    # ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test database
                    # lacks ê- and kî-.
                    preverb_result = Wordform(text=normative_preverb_text,
                                              is_lemma=True)

        if reduplication_string is not None:
            entry = _ReduplicationResult(
                text=reduplication_string,
                definitions=[{
                    "text":
                    "Strong reduplication: intermittent, repeatedly, iteratively; again and again; here and there"
                    if tag == "RdplS+" else
                    "Weak Reduplication: ongoing, continuing"
                }],
            ).serialize()
            _type = "Reduplication"

        if preverb_result is not None:
            entry = serialize_wordform(preverb_result, animate_emoji,
                                       dict_source)
            _type = "Preverb"

        if entry and _type:
            result = _LexicalEntry(entry=entry, type=_type, original_tag=tag)
            lexical_info.append(serialize_lexical_entry(result))

    return lexical_info
Example 11
def get_lexical_info(
    result_analysis: RichAnalysis,
    animate_emoji: str,
    show_emoji: str,
    dict_source: list,
) -> List[Dict]:
    if not result_analysis:
        return []

    result_analysis_tags = result_analysis.prefix_tags
    first_letters = extract_first_letters(result_analysis)

    lexical_info: List[Dict] = []

    for (i, tag) in enumerate(result_analysis_tags):
        preverb_result: Optional[Preverb] = None
        reduplication_string: Optional[str] = None
        _type: Optional[LexicalEntryType] = None
        entry: Optional[_ReduplicationResult | SerializedWordform
                        | _InitialChangeResult] = None

        if tag in ["RdplW+", "RdplS+"]:
            reduplication_string = generate_reduplication_string(
                tag, first_letters[i + 1])

        elif tag == "IC+":
            change_types = get_initial_change_types()
            _type = "Initial Change"
            entry = _InitialChangeResult(text=" ",
                                         definitions=change_types).serialize()

        elif tag.startswith("PV/"):
            preverb_text = tag.replace("PV/", "").replace("+", "")

            # Our FST analyzer doesn't return preverbs with diacritics
            # but we store variations of words in this table
            preverb_results = SourceLanguageKeyword.objects.filter(
                text=preverb_text)

            # get the actual wordform object and
            # make sure the result we return is an IPV
            if preverb_results:
                for result in preverb_results:
                    # Use filter().first() rather than get(): it returns None
                    # instead of raising Wordform.DoesNotExist for a stale id.
                    lexicon_result = Wordform.objects.filter(
                        id=result.wordform_id).first()
                    if lexicon_result:
                        _info = lexicon_result.linguist_info or {}
                        if _info.get("wordclass") == "IPV":
                            preverb_result = lexicon_result
            else:
                # Can't find a match for the preverb in the database.
                # This happens when searching against the test database for
                # ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test database
                # lacks ê- and kî-.
                preverb_result = Wordform(text=preverb_text, is_lemma=True)

        if reduplication_string is not None:
            entry = _ReduplicationResult(
                text=reduplication_string,
                definitions=[{
                    "text":
                    "Strong reduplication: intermittent, repeatedly, iteratively; again and again; here and there"
                    if tag == "RdplS+" else
                    "Weak Reduplication: ongoing, continuing"
                }],
            ).serialize()
            _type = "Reduplication"

        if preverb_result is not None:
            entry = serialize_wordform(preverb_result, animate_emoji,
                                       show_emoji, dict_source)
            _type = "Preverb"

        if entry and _type:
            result = _LexicalEntry(entry=entry, type=_type, original_tag=tag)
            lexical_info.append(serialize_lexical_entry(result))

    return lexical_info
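
Both versions of get_lexical_info dispatch on the shape of each prefix tag. A
minimal distillation of that branch structure, using the FST tags that appear
in the code above (the PV/ tag text is illustrative):

from typing import Optional

def classify_tag(tag: str) -> Optional[str]:
    if tag in ("RdplW+", "RdplS+"):
        return "Reduplication"
    if tag == "IC+":
        return "Initial Change"
    if tag.startswith("PV/"):
        return "Preverb"
    return None

tags = ("PV/e+", "RdplS+", "IC+")
assert [classify_tag(t) for t in tags] == [
    "Preverb", "Reduplication", "Initial Change",
]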