Example #1
0
def test_get_distance_basic_lev(spelling: str, normal_form: str):
    """
    When no 'h' is involved, the modified distance must reduce to the plain
    Levenshtein distance of the lowercased strings.
    """
    lowered_spelling = spelling.lower()
    lowered_normal_form = normal_form.lower()
    assume("h" not in lowered_spelling and "h" not in lowered_normal_form)

    expected = distance(lowered_spelling, lowered_normal_form)
    assert get_modified_distance(spelling, normal_form) == expected
Example #2
0
def fetch_results_from_source_language_keywords(search_run):
    """
    Look the query up as a source-language keyword and add each matching
    wordform to the search run's results.
    """
    keyword = to_source_language_keyword(search_run.internal_query)
    matches = SourceLanguageKeyword.objects.filter(Q(text=keyword))
    for match in matches:
        edit_distance = get_modified_distance(
            search_run.internal_query, match.wordform.text)
        search_run.add_result(
            Result(
                match.wordform,
                source_language_keyword_match=[match.text],
                query_wordform_edit_distance=edit_distance,
            )
        )
def do_source_language_affix_search(search_run: core.SearchRun):
    """
    Run a source-language affix search for the query and record every
    matching wordform on the search run.
    """
    hits = do_affix_search(
        search_run.internal_query,
        cache.source_language_affix_searcher,
    )
    for hit in hits:
        edit_distance = get_modified_distance(hit.text, search_run.internal_query)
        search_run.add_result(
            Result(
                hit,
                source_language_affix_match=True,
                query_wordform_edit_distance=edit_distance,
            )
        )
Example #4
0
def fetch_results(search_run: core.SearchRun):
    """
    Populate ``search_run`` with results from every lookup strategy:

    1. target-language keyword matches,
    2. source-language keyword matches,
    3. FST analyses of the (spelling-relaxed) query, matched against the
       database or — when absent from it — synthesized from their lemmas.
    """
    fetch_results_from_target_language_keywords(search_run)
    fetch_results_from_source_language_keywords(search_run)

    # Use the spelling relaxation to try to decipher the query
    #   e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
    #         thus, we can match "acâhkos" in the dictionary!
    fst_analyses = set(rich_analyze_relaxed(search_run.internal_query))
    # BUGFIX: this was a bare debug print() to stdout; use the module logger
    # (already used below) with lazy %-formatting instead.
    logger.debug(
        "FST analyses for query %r: %s",
        search_run.internal_query,
        [a.tuple for a in fst_analyses],
    )

    db_matches = list(
        Wordform.objects.filter(
            raw_analysis__in=[a.tuple for a in fst_analyses]))

    for wf in db_matches:
        search_run.add_result(
            Result(
                wf,
                source_language_match=wf.text,
                query_wordform_edit_distance=get_modified_distance(
                    wf.text, search_run.internal_query),
            ))

        # An exact match here means we’re done with this analysis.
        fst_analyses.discard(wf.analysis)

    # fst_analyses has now been thinned by the `.discard()` calls above;
    # remaining items are analyses which are not in the database, although
    # their lemmas should be.
    for analysis in fst_analyses:
        # When the user query is outside of paradigm tables
        # e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik
        # e.g. Initial change: nêpât: {'IC+nipâw+V+AI+Cnj+3Sg'}

        normatized_form_for_analysis = strict_generator().lookup(
            analysis.smushed())
        if len(normatized_form_for_analysis) == 0:
            # The generator FST can't realize this analysis; skip it but
            # leave a trace for debugging.
            logger.error(
                "Cannot generate normative form for analysis: %s (query: %s)",
                analysis,
                search_run.internal_query,
            )
            continue

        # If there are multiple forms for this analysis, use the one that is
        # closest to what the user typed.
        normatized_user_query = min(
            normatized_form_for_analysis,
            key=lambda f: get_modified_distance(f, search_run.internal_query),
        )

        possible_lemma_wordforms = best_lemma_matches(
            analysis,
            Wordform.objects.filter(text=analysis.lemma, is_lemma=True))

        for lemma_wordform in possible_lemma_wordforms:
            # Unsaved wordform: exists only to carry this synthetic result.
            synthetic_wordform = Wordform(
                text=normatized_user_query,
                raw_analysis=analysis.tuple,
                lemma=lemma_wordform,
            )
            search_run.add_result(
                Result(
                    synthetic_wordform,
                    analyzable_inflection_match=True,
                    query_wordform_edit_distance=get_modified_distance(
                        search_run.internal_query,
                        normatized_user_query,
                    ),
                ))
Example #5
0
def test_get_distance(spelling: str, normal_form: str, expected_distance):
    """Each (spelling, normal form) pair yields its expected distance."""
    actual = get_modified_distance(spelling, normal_form)
    assert actual == expected_distance
def get_lexical_info(result_analysis: RichAnalysis, animate_emoji: str,
                     dict_source: list) -> List[Dict]:
    """
    Build serialized lexical-entry breakdowns (reduplication, initial change,
    preverbs) for each prefix tag of an FST analysis.

    Args:
        result_analysis: rich FST analysis of a result wordform.
        animate_emoji: emoji setting forwarded to wordform serialization.
        dict_source: dictionary-source filter forwarded to serialization.

    Returns:
        One serialized lexical entry per recognized prefix tag; empty when
        the analysis is falsy or no tag is recognized.
    """
    if not result_analysis:
        return []

    result_analysis_tags = result_analysis.prefix_tags
    # First letter of the material following each tag — used to build the
    # reduplicated syllable below.
    first_letters = extract_first_letters(result_analysis)

    lexical_info: List[Dict] = []

    for (i, tag) in enumerate(result_analysis_tags):
        # Per-tag state; at most one of the branches below fills these in.
        preverb_result: Optional[Preverb] = None
        reduplication_string: Optional[str] = None
        _type: Optional[LexicalEntryType] = None
        entry: Optional[_ReduplicationResult | SerializedWordform
                        | _InitialChangeResult] = None

        if tag in ["RdplW+", "RdplS+"]:
            # Weak/strong reduplication: derive the reduplicated syllable
            # from the first letter of whatever follows this tag.
            reduplication_string = generate_reduplication_string(
                tag, first_letters[i + 1])

        elif tag == "IC+":
            change_types = get_initial_change_types()
            _type = "Initial Change"
            entry = _InitialChangeResult(text=" ",
                                         definitions=change_types).serialize()

        elif tag.startswith("PV/"):
            # use altlabel.tsv to figure out the preverb

            # ling_short looks like: "Preverb: âpihci-"
            ling_short = read_labels().linguistic_short.get(
                cast(FSTTag, tag.rstrip("+")))
            if ling_short:
                # Drop the "Preverb: " label, keeping e.g. "âpihci-".
                normative_preverb_text = ling_short[len("Preverb: "):]
                # Only analysis-less entries are preverb candidates.
                preverb_results = Wordform.objects.filter(
                    text=normative_preverb_text, raw_analysis__isnull=True)

                # find the one that looks the most similar
                if preverb_results:
                    preverb_result = min(
                        preverb_results,
                        key=lambda pr: get_modified_distance(
                            normative_preverb_text,
                            pr.text.strip("-"),
                        ),
                    )

                else:
                    # Can't find a match for the preverb in the database.
                    # This happens when searching against the test database
                    # for ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test
                    # database lacks ê and kî. Fall back to an unsaved
                    # placeholder wordform.
                    preverb_result = Wordform(text=normative_preverb_text,
                                              is_lemma=True)

        if reduplication_string is not None:
            entry = _ReduplicationResult(
                text=reduplication_string,
                definitions=[{
                    "text":
                    "Strong reduplication: intermittent, repeatedly, iteratively; again and again; here and there"
                    if tag == "RdplS+" else
                    "Weak Reduplication: ongoing, continuing"
                }],
            ).serialize()
            _type = "Reduplication"

        if preverb_result is not None:
            entry = serialize_wordform(preverb_result, animate_emoji,
                                       dict_source)
            _type = "Preverb"

        # Tags that yielded both an entry and a type contribute a result;
        # unrecognized tags are silently skipped.
        if entry and _type:
            result = _LexicalEntry(entry=entry, type=_type, original_tag=tag)
            lexical_info.append(serialize_lexical_entry(result))

    return lexical_info