def build_result(
    wordform_length=0,
    target_language_keyword_match_len=0,
    **result_kwargs,
):
    wf = Wordform(text="f" * wordform_length, is_lemma=True)
    wf.lemma = wf
    result_kwargs.setdefault(
        "target_language_keyword_match", ["x"] * target_language_keyword_match_len
    )
    result = Result(wf, **result_kwargs)
    return result
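# Usage sketch for the helper above (values are illustrative): a Result
# wrapping a synthetic three-character wordform that carries two
# placeholder target-language keyword matches.
result = build_result(wordform_length=3, target_language_keyword_match_len=2)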
def test_espt_search_doesnt_crash_when_no_analysis(db):
    search_run = SearchRun("my little bears")
    espt_search = EsptSearch(search_run)
    espt_search.analyze_query()

    wordform = Wordform(text="pê-")
    wordform.lemma = wordform
    wordform.is_lemma = True
    search_run.add_result(
        Result(wordform=wordform, target_language_keyword_match=["bear"])
    )

    # This will crash if the espt code doesn’t handle results without an analysis
    espt_search.inflect_search_results()
def populate_wordform_definitions(self, wf, senses):
    should_do_translation = self.translate_wordforms
    if should_do_translation:
        has_analysis_and_paradigm = (
            (wf.analysis and wf.paradigm)
            if not settings.MORPHODICT_ENABLE_FST_LEMMA_SUPPORT
            else (wf.fst_lemma and wf.paradigm)
        )
        if not has_analysis_and_paradigm:
            should_do_translation = False

    definitions_and_sources = self.create_definitions(wf, senses)

    if not should_do_translation:
        return

    lemma_text = (
        wf.text
        if not settings.MORPHODICT_ENABLE_FST_LEMMA_SUPPORT
        else wf.fst_lemma
    )
    for (
        prefix_tags,
        suffix_tags,
    ) in self.paradigm_manager.all_analysis_template_tags(wf.paradigm):
        analysis = RichAnalysis((prefix_tags, lemma_text, suffix_tags))
        for generated in strict_generator().lookup(analysis.smushed()):
            # Skip re-instantiating the lemma
            if analysis == wf.analysis:
                continue

            inflected_wordform = Wordform(
                # For now, leaving paradigm and linguist_info empty;
                # code can get that info from the lemma instead.
                text=generated,
                raw_analysis=analysis.tuple,
                lemma=wf,
                is_lemma=False,
            )

            for d, sources in definitions_and_sources:
                translation = translate_single_definition(
                    inflected_wordform, d.text, self.translation_stats
                )
                if translation is None:
                    continue

                is_inflected_wordform_unsaved = inflected_wordform.id is None
                if is_inflected_wordform_unsaved:
                    self.wordform_buffer.add(inflected_wordform)

                self._add_definition(
                    inflected_wordform,
                    translation,
                    ("🤖" + source for source in sources),
                    auto_translation_source=d,
                )
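# Orientation sketch for the generation loop above (the tags are
# illustrative, not taken from a real paradigm file): a RichAnalysis is
# built from (prefix tags, lemma text, suffix tags), and smushed() yields
# the single FST input string that strict_generator() maps to zero or
# more surface forms.
analysis = RichAnalysis(((), "nipâw", ("+V", "+AI", "+Ind", "+3Sg")))
for generated in strict_generator().lookup(analysis.smushed()):
    print(generated)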
def inflect_search_results(self):
    if not self.query_analyzed_ok:
        return

    inflected_results = self._generate_inflected_results()

    # aggregating queries for performance
    possible_wordforms = Wordform.objects.filter(
        text__in=[r.inflected_text for r in inflected_results]
    )
    wordform_lookup = {}
    for wf in possible_wordforms:
        wordform_lookup[(wf.text, wf.lemma_id)] = wf

    for result in inflected_results:
        wordform = wordform_lookup.get(
            (result.inflected_text, result.original_result.lemma_wordform.id)
        )
        if wordform is None:
            # The inflected form was not found in the DB, so create a
            # synthetic one. Can happen for Plains Cree when the ESPT search
            # produces a valid analysis not covered by any paradigm file.
            #
            # Note: it would not have auto-translations, since those are
            # currently only available for wordforms that were previously
            # saved in the DB.
            lemma = result.original_result.lemma_wordform
            wordform = Wordform(
                text=result.inflected_text,
                lemma=lemma,
                raw_analysis=result.analysis.tuple,
            )

        # If there are multiple inflections for the same original result, we
        # may have already removed it.
        if self.search_run.has_result(result.original_result):
            self.search_run.remove_result(result.original_result)

        self.search_run.add_result(
            result.original_result.create_related_result(
                wordform,
                is_espt_result=True,
            )
        )
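# Sketch of the batching pattern above (candidate_texts, some_text, and
# some_lemma_id are placeholders): one query fetches every candidate, then
# each result is resolved with an O(1) dict lookup keyed on
# (text, lemma_id) instead of issuing its own query.
by_key = {
    (wf.text, wf.lemma_id): wf
    for wf in Wordform.objects.filter(text__in=candidate_texts)
}
match = by_key.get((some_text, some_lemma_id))  # None when absent from the DB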
def serialize_wordform(
    wordform: Wordform, animate_emoji: str, dict_source: list
) -> SerializedWordform:
    """
    Intended to be passed in a JSON API or into templates.

    :return: JSON-parsable result
    """
    result = model_to_dict(wordform)
    result["definitions"] = serialize_definitions(
        wordform.definitions.all(), dict_source=dict_source
    )
    result["lemma_url"] = wordform.get_absolute_url()

    if wordform.linguist_info:
        if inflectional_category := wordform.linguist_info.get(
            "inflectional_category", None
        ):
            result.update(
                {
                    "inflectional_category_plain_english": read_labels().english.get(
                        inflectional_category
                    ),
                    "inflectional_category_linguistic": read_labels().linguistic_long.get(
                        inflectional_category
                    ),
                }
            )
        if wordclass := wordform.linguist_info.get("wordclass"):
            result["wordclass_emoji"] = get_emoji_for_cree_wordclass(
                wordclass, animate_emoji
            )

    return result
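# Usage sketch (hypothetical emoji and dictionary-source values; the real
# ones come from site configuration):
wf = Wordform.objects.get(slug="acâhkos")
payload = serialize_wordform(wf, animate_emoji="🐺", dict_source=["CW"])
print(payload["lemma_url"], len(payload["definitions"]))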
def fetch_results(search_run: core.SearchRun):
    fetch_results_from_target_language_keywords(search_run)
    fetch_results_from_source_language_keywords(search_run)

    # Use the spelling relaxation to try to decipher the query
    #   e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
    #   thus, we can match "acâhkos" in the dictionary!
    fst_analyses = set(rich_analyze_relaxed(search_run.internal_query))

    db_matches = list(
        Wordform.objects.filter(raw_analysis__in=[a.tuple for a in fst_analyses])
    )

    for wf in db_matches:
        search_run.add_result(
            Result(
                wf,
                source_language_match=wf.text,
                query_wordform_edit_distance=get_modified_distance(
                    wf.text, search_run.internal_query
                ),
            )
        )
        # An exact match here means we’re done with this analysis.
        fst_analyses.discard(wf.analysis)

    # fst_analyses has now been thinned by the calls to
    # `fst_analyses.discard()` above; remaining items are analyses that are
    # not in the database, although their lemmas should be.
    for analysis in fst_analyses:
        # When the user query is outside of paradigm tables
        # e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik
        # e.g. Initial change: nêpât: {'IC+nipâw+V+AI+Cnj+3Sg'}
        normatized_form_for_analysis = strict_generator().lookup(analysis.smushed())
        if len(normatized_form_for_analysis) == 0:
            logger.error(
                "Cannot generate normative form for analysis: %s (query: %s)",
                analysis,
                search_run.internal_query,
            )
            continue

        # If there are multiple forms for this analysis, use the one that is
        # closest to what the user typed.
        normatized_user_query = min(
            normatized_form_for_analysis,
            key=lambda f: get_modified_distance(f, search_run.internal_query),
        )

        possible_lemma_wordforms = best_lemma_matches(
            analysis, Wordform.objects.filter(text=analysis.lemma, is_lemma=True)
        )

        for lemma_wordform in possible_lemma_wordforms:
            synthetic_wordform = Wordform(
                text=normatized_user_query,
                raw_analysis=analysis.tuple,
                lemma=lemma_wordform,
            )
            search_run.add_result(
                Result(
                    synthetic_wordform,
                    analyzable_inflection_match=True,
                    query_wordform_edit_distance=get_modified_distance(
                        search_run.internal_query,
                        normatized_user_query,
                    ),
                )
            )
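# Sketch of the tie-break used above (strings are illustrative): when the
# generator returns several normatized forms, min() keeps the one closest
# to what the user typed.
closest = min(
    {"acâhkos", "atâhkos"},
    key=lambda f: get_modified_distance(f, "atchakosuk"),
)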
def location(self, item: Wordform):
    return item.get_absolute_url(ambiguity="allow")
class Import:
    def __init__(
        self,
        importjson: list,
        translate_wordforms: bool,
        purge: bool,
        incremental: bool,
        atomic=True,
        skip_building_vectors_because_testing=False,
    ):
        """
        Create an Import process.

        If atomic is False, this will use batch processing that still works
        when not in a transaction.
        """
        self.dictionary_source_cache = DictionarySourceCache()
        self.data = importjson
        self.translate_wordforms = translate_wordforms
        self.incremental = incremental
        self.purge = purge
        self.skip_building_vectors_because_testing = (
            skip_building_vectors_because_testing
        )
        self._has_run = False

        self.paradigm_manager = default_paradigm_manager()
        self.translation_stats = TranslationStats()

        trigger_deps = not atomic
        self.wordform_buffer = InsertBuffer(Wordform.objects, assign_id=True)
        self.definition_buffer = InsertBuffer(
            Definition.objects,
            assign_id=True,
            trigger_deps=trigger_deps,
            deps=[self.wordform_buffer],
        )
        self.citation_buffer = InsertBuffer(
            Definition.citations.through.objects,
            trigger_deps=trigger_deps,
            deps=[self.definition_buffer],
        )
        self.source_language_keyword_buffer = InsertBuffer(
            SourceLanguageKeyword.objects,
            trigger_deps=trigger_deps,
            deps=[self.wordform_buffer],
        )
        self.target_language_keyword_buffer = InsertBuffer(
            TargetLanguageKeyword.objects,
            trigger_deps=trigger_deps,
            deps=[self.wordform_buffer],
        )

    def run(self):
        """Run the import process.

        This is the only method that external code should call.
        """
        if self._has_run:
            raise Exception("run can only be called once")
        self._has_run = True

        freshness_check = FreshnessCheck(self.data)
        seen_slugs = set()

        if self.purge:
            existing_slugs = self.gather_slugs()

        form_definitions = []
        for entry in tqdm(self.data, smoothing=0):
            if "formOf" in entry:
                form_definitions.append(entry)
                continue

            if len(entry["senses"]) == 0:
                raise Exception(f'Error: no senses for slug {entry["slug"]}')
            for sense in entry["senses"]:
                if "definition" not in sense:
                    raise Exception(
                        f'Error: no "definition" in sense {sense!r} of slug {entry["slug"]}'
                    )

            seen_slugs.add(validate_slug_format(entry["slug"]))

            if self.incremental and freshness_check.is_fresh(entry["slug"]):
                continue

            if existing := Wordform.objects.filter(slug=entry["slug"]).first():
                # Cascade should take care of all related objects.
                existing.delete()

            fst_lemma = None
            if "fstLemma" in entry:
                fst_lemma = entry["fstLemma"]
            elif (analysis := entry.get("analysis")) is not None:
                fst_lemma = analysis[1]

            wf = Wordform(
                text=entry["head"],
                raw_analysis=entry.get("analysis", None),
                fst_lemma=fst_lemma,
                paradigm=entry.get("paradigm", None),
                slug=entry["slug"],
                is_lemma=True,
                linguist_info=entry.get("linguistInfo", {}),
                import_hash=freshness_check.importjson_hash_for_slug(entry["slug"]),
            )
            self.wordform_buffer.add(wf)
            assert wf.id is not None
            wf.lemma_id = wf.id

            if "senses" not in entry:
                raise Exception(
                    f"Invalid importjson: no senses for lemma text={wf.text} slug={wf.slug}"
                )
            self.populate_wordform_definitions(wf, entry["senses"])

            # Avoid dupes for this wordform
            seen_source_language_keywords: set[str] = set()

            slug_base = wf.slug.split("@")[0]
            if wf.text != slug_base and slug_base:
                self.add_source_language_keyword(
                    wf, slug_base, seen_source_language_keywords
                )
            if wf.fst_lemma and wf.text != wf.fst_lemma:
                self.add_source_language_keyword(
                    wf, wf.fst_lemma, seen_source_language_keywords
                )
            if wf.raw_analysis is None:
                self.index_unanalyzed_form(wf, seen_source_language_keywords)
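# Usage sketch (`entries` is hypothetical, standing in for a parsed
# importjson list, e.g. the result of json.load()):
importer = Import(
    importjson=entries,
    translate_wordforms=True,
    purge=False,
    incremental=True,
)
importer.run()  # a second call raises, guarded by _has_run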
def make_wf(text: str = "foo"):
    ret = Wordform(text=text, is_lemma=True)
    ret.lemma = ret
    return ret
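# Usage sketch: make_wf returns a wordform that is its own lemma,
# convenient for constructing test fixtures.
wf = make_wf("nipâw")
assert wf.is_lemma and wf.lemma is wf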
def get_lexical_info(
    result_analysis: RichAnalysis, animate_emoji: str, dict_source: list
) -> List[Dict]:
    if not result_analysis:
        return []

    result_analysis_tags = result_analysis.prefix_tags
    first_letters = extract_first_letters(result_analysis)

    lexical_info: List[Dict] = []

    for (i, tag) in enumerate(result_analysis_tags):
        preverb_result: Optional[Preverb] = None
        reduplication_string: Optional[str] = None
        _type: Optional[LexicalEntryType] = None
        entry: Optional[
            _ReduplicationResult | SerializedWordform | _InitialChangeResult
        ] = None

        if tag in ["RdplW+", "RdplS+"]:
            reduplication_string = generate_reduplication_string(
                tag, first_letters[i + 1]
            )
        elif tag == "IC+":
            change_types = get_initial_change_types()
            _type = "Initial Change"
            entry = _InitialChangeResult(
                text=" ", definitions=change_types
            ).serialize()
        elif tag.startswith("PV/"):
            # Use altlabel.tsv to figure out the preverb.
            # ling_short looks like: "Preverb: âpihci-"
            ling_short = read_labels().linguistic_short.get(
                cast(FSTTag, tag.rstrip("+"))
            )
            if ling_short:
                # Drop the "Preverb: " prefix, leaving "âpihci-"; the trailing
                # hyphen is only stripped below, when comparing edit distances.
                normative_preverb_text = ling_short[len("Preverb: "):]
                preverb_results = Wordform.objects.filter(
                    text=normative_preverb_text, raw_analysis__isnull=True
                )

                # find the one that looks the most similar
                if preverb_results:
                    preverb_result = min(
                        preverb_results,
                        key=lambda pr: get_modified_distance(
                            normative_preverb_text,
                            pr.text.strip("-"),
                        ),
                    )
                else:
                    # Can’t find a match for the preverb in the database. This
                    # happens when searching against the test database for
                    # ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test
                    # database lacks ê- and kî-.
                    preverb_result = Wordform(
                        text=normative_preverb_text, is_lemma=True
                    )

        if reduplication_string is not None:
            entry = _ReduplicationResult(
                text=reduplication_string,
                definitions=[
                    {
                        "text": (
                            "Strong reduplication: intermittent, repeatedly, "
                            "iteratively; again and again; here and there"
                            if tag == "RdplS+"
                            else "Weak Reduplication: ongoing, continuing"
                        )
                    }
                ],
            ).serialize()
            _type = "Reduplication"

        if preverb_result is not None:
            entry = serialize_wordform(preverb_result, animate_emoji, dict_source)
            _type = "Preverb"

        if entry and _type:
            result = _LexicalEntry(entry=entry, type=_type, original_tag=tag)
            lexical_info.append(serialize_lexical_entry(result))

    return lexical_info
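# Sketch of the label slice above: only the "Preverb: " prefix is dropped,
# so the trailing hyphen survives; it is removed later, during the
# edit-distance comparison.
assert "Preverb: âpihci-"[len("Preverb: "):] == "âpihci-"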
def get_lexical_info(
    result_analysis: RichAnalysis,
    animate_emoji: str,
    show_emoji: str,
    dict_source: list,
) -> List[Dict]:
    if not result_analysis:
        return []

    result_analysis_tags = result_analysis.prefix_tags
    first_letters = extract_first_letters(result_analysis)

    lexical_info: List[Dict] = []

    for (i, tag) in enumerate(result_analysis_tags):
        preverb_result: Optional[Preverb] = None
        reduplication_string: Optional[str] = None
        _type: Optional[LexicalEntryType] = None
        entry: Optional[
            _ReduplicationResult | SerializedWordform | _InitialChangeResult
        ] = None

        if tag in ["RdplW+", "RdplS+"]:
            reduplication_string = generate_reduplication_string(
                tag, first_letters[i + 1]
            )
        elif tag == "IC+":
            change_types = get_initial_change_types()
            _type = "Initial Change"
            entry = _InitialChangeResult(
                text=" ", definitions=change_types
            ).serialize()
        elif tag.startswith("PV/"):
            preverb_text = tag.replace("PV/", "").replace("+", "")

            # Our FST analyzer doesn't return preverbs with diacritics,
            # but we store variations of words in this table.
            preverb_results = SourceLanguageKeyword.objects.filter(text=preverb_text)

            # Get the actual wordform object, and make sure the result we
            # return is an IPV.
            if preverb_results:
                for keyword in preverb_results:
                    # filter().first() returns None instead of raising when
                    # the wordform is missing.
                    lexicon_result = Wordform.objects.filter(
                        id=keyword.wordform_id
                    ).first()
                    if lexicon_result:
                        _info = lexicon_result.linguist_info
                        # Guard against missing linguist_info before reading
                        # the wordclass.
                        if _info and _info.get("wordclass") == "IPV":
                            preverb_result = lexicon_result
            else:
                # Can’t find a match for the preverb in the database. This
                # happens when searching against the test database for
                # ê-kî-nitawi-kâh-kîmôci-kotiskâwêyâhk, as the test database
                # lacks ê- and kî-.
                preverb_result = Wordform(text=preverb_text, is_lemma=True)

        if reduplication_string is not None:
            entry = _ReduplicationResult(
                text=reduplication_string,
                definitions=[
                    {
                        "text": (
                            "Strong reduplication: intermittent, repeatedly, "
                            "iteratively; again and again; here and there"
                            if tag == "RdplS+"
                            else "Weak Reduplication: ongoing, continuing"
                        )
                    }
                ],
            ).serialize()
            _type = "Reduplication"

        if preverb_result is not None:
            entry = serialize_wordform(
                preverb_result, animate_emoji, show_emoji, dict_source
            )
            _type = "Preverb"

        if entry and _type:
            result = _LexicalEntry(entry=entry, type=_type, original_tag=tag)
            lexical_info.append(serialize_lexical_entry(result))

    return lexical_info
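# Sketch of the tag-to-text step above (the tag is illustrative): a PV tag
# such as "PV/nitawi+" reduces to the bare preverb "nitawi" before the
# keyword lookup.
assert "PV/nitawi+".replace("PV/", "").replace("+", "") == "nitawi"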