def test_get_distance_basic_lev(spelling: str, normal_form: str):
    """
    make sure the modified distance gets basic lev distance right
    """
    # The modified metric only diverges from plain Levenshtein on inputs
    # containing "h"; restrict generated inputs so the two must agree.
    assume(all("h" not in s.lower() for s in (spelling, normal_form)))
    expected = distance(spelling.lower(), normal_form.lower())
    assert get_modified_distance(spelling, normal_form) == expected
def fetch_results_from_source_language_keywords(search_run):
    """
    Add a result for every wordform whose source-language keyword exactly
    matches the (keyword-normalized) query.
    """
    keyword_text = to_source_language_keyword(search_run.internal_query)
    matches = SourceLanguageKeyword.objects.filter(Q(text=keyword_text))
    for keyword in matches:
        # Rank by how far the user's spelling is from the matched wordform.
        edit_distance = get_modified_distance(
            search_run.internal_query, keyword.wordform.text
        )
        search_run.add_result(
            Result(
                keyword.wordform,
                source_language_keyword_match=[keyword.text],
                query_wordform_edit_distance=edit_distance,
            )
        )
def do_source_language_affix_search(search_run: core.SearchRun):
    """
    Run an affix (prefix/suffix) search on the query in the source language
    and record every matching wordform as a result.
    """
    hits = do_affix_search(
        search_run.internal_query,
        cache.source_language_affix_searcher,
    )
    for hit in hits:
        distance_to_query = get_modified_distance(hit.text, search_run.internal_query)
        search_run.add_result(
            Result(
                hit,
                source_language_affix_match=True,
                query_wordform_edit_distance=distance_to_query,
            )
        )
def fetch_results(search_run: core.SearchRun):
    """
    Populate ``search_run`` with results from every search strategy:
    target-language keywords, source-language keywords, and FST analyses of
    the (possibly misspelled) query — synthesizing wordforms for analyses
    that are not in the database.
    """
    fetch_results_from_target_language_keywords(search_run)
    fetch_results_from_source_language_keywords(search_run)

    # Use the spelling relaxation to try to decipher the query
    # e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
    # thus, we can match "acâhkos" in the dictionary!
    fst_analyses = set(rich_analyze_relaxed(search_run.internal_query))

    # Was a bare debug print(); route the diagnostics through the module
    # logger so ordinary searches don't write to stdout.
    logger.debug(
        "Relaxed FST analyses for %r: %s",
        search_run.internal_query,
        [a.tuple for a in fst_analyses],
    )

    db_matches = list(
        Wordform.objects.filter(raw_analysis__in=[a.tuple for a in fst_analyses])
    )

    for wf in db_matches:
        search_run.add_result(
            Result(
                wf,
                source_language_match=wf.text,
                query_wordform_edit_distance=get_modified_distance(
                    wf.text, search_run.internal_query
                ),
            )
        )
        # An exact match here means we’re done with this analysis.
        fst_analyses.discard(wf.analysis)

    # fst_analyses has now been thinned by the `fst_analyses.discard()` calls
    # above; remaining items are analyses which are not in the database,
    # although their lemmas should be.
    for analysis in fst_analyses:
        # When the user query is outside of paradigm tables
        # e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik
        # e.g. Initial change: nêpât: {'IC+nipâw+V+AI+Cnj+3Sg'}
        normatized_form_for_analysis = strict_generator().lookup(analysis.smushed())
        if len(normatized_form_for_analysis) == 0:
            logger.error(
                "Cannot generate normative form for analysis: %s (query: %s)",
                analysis,
                search_run.internal_query,
            )
            continue

        # If there are multiple forms for this analysis, use the one that is
        # closest to what the user typed.
        normatized_user_query = min(
            normatized_form_for_analysis,
            key=lambda f: get_modified_distance(f, search_run.internal_query),
        )

        possible_lemma_wordforms = best_lemma_matches(
            analysis, Wordform.objects.filter(text=analysis.lemma, is_lemma=True)
        )

        for lemma_wordform in possible_lemma_wordforms:
            # Build an in-memory (unsaved) wordform for the inflected form the
            # user most likely meant, attached to its best-matching lemma.
            synthetic_wordform = Wordform(
                text=normatized_user_query,
                raw_analysis=analysis.tuple,
                lemma=lemma_wordform,
            )
            search_run.add_result(
                Result(
                    synthetic_wordform,
                    analyzable_inflection_match=True,
                    query_wordform_edit_distance=get_modified_distance(
                        search_run.internal_query,
                        normatized_user_query,
                    ),
                )
            )
def test_get_distance(spelling: str, normal_form: str, expected_distance):
    """The modified distance matches the pre-computed expected value."""
    actual = get_modified_distance(spelling, normal_form)
    assert actual == expected_distance
def get_lexical_info(
    result_analysis: RichAnalysis, animate_emoji: str, dict_source: list
) -> List[Dict]:
    """
    Build serialized lexical-entry info for every prefix tag of an analysis.

    Recognizes three kinds of prefix tags:
      * reduplication ("RdplW+" / "RdplS+"),
      * initial change ("IC+"),
      * preverbs ("PV/..."),
    and serializes each into a dict describing it for display.

    Args:
        result_analysis: the rich FST analysis whose prefix tags are examined
        animate_emoji: passed through to wordform serialization
        dict_source: passed through to wordform serialization

    Returns:
        one serialized lexical entry per recognized prefix tag, in tag order;
        empty list when there is no analysis.
    """
    if not result_analysis:
        return []

    result_analysis_tags = result_analysis.prefix_tags
    # First letter of the morpheme following each tag; reduplication needs it.
    first_letters = extract_first_letters(result_analysis)

    lexical_info: List[Dict] = []

    for (i, tag) in enumerate(result_analysis_tags):
        # Per-tag scratch state; at most one of the branches below fills these.
        preverb_result: Optional[Preverb] = None
        reduplication_string: Optional[str] = None
        _type: Optional[LexicalEntryType] = None
        entry: Optional[
            _ReduplicationResult | SerializedWordform | _InitialChangeResult
        ] = None

        if tag in ["RdplW+", "RdplS+"]:
            # i + 1: the reduplicated letter comes from what FOLLOWS the tag.
            reduplication_string = generate_reduplication_string(
                tag, first_letters[i + 1]
            )
        elif tag == "IC+":
            change_types = get_initial_change_types()
            _type = "Initial Change"
            entry = _InitialChangeResult(text=" ", definitions=change_types).serialize()
        elif tag.startswith("PV/"):
            # use altlabel.tsv to figure out the preverb
            # ling_short looks like: "Preverb: âpihci-"
            ling_short = read_labels().linguistic_short.get(
                cast(FSTTag, tag.rstrip("+"))
            )
            if ling_short:
                # convert to "âpihci" by dropping prefix and last character
                normative_preverb_text = ling_short[len("Preverb: "):]
                preverb_results = Wordform.objects.filter(
                    text=normative_preverb_text, raw_analysis__isnull=True
                )

                # find the one that looks the most similar
                if preverb_results:
                    preverb_result = min(
                        preverb_results,
                        key=lambda pr: get_modified_distance(
                            normative_preverb_text,
                            pr.text.strip("-"),
                        ),
                    )
                else:
                    # Can't find a match for the preverb in the database.
                    # This happens when searching against the test database for
                    # ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test database
                    # lacks ê and kî.
                    # Fall back to an in-memory (unsaved) wordform.
                    preverb_result = Wordform(
                        text=normative_preverb_text, is_lemma=True
                    )

        # Reduplication serialization happens outside the branch above because
        # the definition text depends on whether the tag was strong or weak.
        if reduplication_string is not None:
            entry = _ReduplicationResult(
                text=reduplication_string,
                definitions=[{
                    "text": "Strong reduplication: intermittent, repeatedly, iteratively; again and again; here and there"
                    if tag == "RdplS+"
                    else "Weak Reduplication: ongoing, continuing"
                }],
            ).serialize()
            _type = "Reduplication"

        if preverb_result is not None:
            entry = serialize_wordform(preverb_result, animate_emoji, dict_source)
            _type = "Preverb"

        # Only emit an entry when one of the branches produced both pieces;
        # unrecognized tags are silently skipped.
        if entry and _type:
            result = _LexicalEntry(entry=entry, type=_type, original_tag=tag)
            lexical_info.append(serialize_lexical_entry(result))
    return lexical_info