def do_cvd_search(search_run: SearchRun):
    """Use cosine vector distance to add results to the search run.

    Keywords from the query string are turned into vectors from Google News,
    added together, and then compared against pre-computed definition vectors.
    """
    keys = extract_keyed_words(search_run.query.query_string, google_news_vectors())
    if not keys:
        return
    search_run.add_verbose_message(cvd_extracted_keys=keys)
    query_vector = vector_for_keys(google_news_vectors(), keys)

    try:
        closest = definition_vectors().similar_by_vector(query_vector, 50)
    except DefinitionVectorsNotFoundException:
        logger.exception("")
        return

    wordform_queries = [cvd_key_to_wordform_query(cvd_key) for cvd_key, _ in closest]
    similarities = [similarity for cvd_key, similarity in closest]

    # Get all possible wordforms in one big query. We will select more than we
    # need, then filter it down later, but this will have to do until we get
    # better homonym handling.
    wordform_results = Wordform.objects.filter(
        text__in=set(wf["text"] for wf in wordform_queries)
    )

    # Now match back up
    wordforms_by_text = {
        text: list(wordforms)
        for text, wordforms in itertools.groupby(wordform_results, key=lambda x: x.text)
    }

    for similarity, wordform_query in zip(similarities, wordform_queries):
        # gensim uses the terminology, similarity = 1 - distance. Its
        # similarity is a number from 0 to 1, with more similar items having
        # similarity closer to 1. A distance should be small for things that
        # are close together.
        distance = 1 - similarity

        wordforms_for_query = wordforms_by_text.get(wordform_query["text"], None)
        if wordforms_for_query is None:
            logger.warning(
                f"Wordform {wordform_query['text']} not found in CVD; mismatch between "
                "definition vector model file and definitions in database?"
            )
        else:
            for wf in wordforms_for_query:
                if wordform_query_matches(wordform_query, wf):
                    search_run.add_result(Result(wf, cosine_vector_distance=distance))
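# A minimal, self-contained sketch of the vector math that do_cvd_search()
# relies on, assuming gensim KeyedVectors models for both the Google News word
# vectors and the pre-computed definition vectors. The function name and
# arguments here are illustrative assumptions, not the project's actual API.
import numpy as np
from gensim.models import KeyedVectors


def sketch_cvd(query: str, news_vectors: KeyedVectors, defn_vectors: KeyedVectors):
    # Keep only the query words that the word-vector model knows about.
    keys = [word for word in query.lower().split() if word in news_vectors]
    if not keys:
        return []
    # Add the word vectors together to form a single query vector.
    query_vector = np.sum([news_vectors[key] for key in keys], axis=0)
    # gensim returns (key, similarity) pairs, most similar first; convert
    # similarity to a distance so that smaller means closer.
    return [
        (key, 1 - similarity)
        for key, similarity in defn_vectors.similar_by_vector(query_vector, topn=50)
    ]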
def test_espt_search_doesnt_crash_when_no_analysis(db):
    search_run = SearchRun("my little bears")
    espt_search = EsptSearch(search_run)
    espt_search.analyze_query()

    wordform = Wordform(text="pê-")
    wordform.lemma = wordform
    wordform.is_lemma = True
    search_run.add_result(
        Result(wordform=wordform, target_language_keyword_match=["bear"])
    )

    # This will crash if the espt code doesn't handle results without an analysis
    espt_search.inflect_search_results()
def search(*, query: str, include_affixes=True, include_auto_definitions=False) -> SearchRun:
    """
    Perform an actual search, using the provided options.

    This function encapsulates the logic of which search methods to try, and
    in which order, to build up results in a SearchRun.
    """
    search_run = SearchRun(query=query, include_auto_definitions=include_auto_definitions)

    if search_run.query.espt:
        espt_search = EsptSearch(search_run)
        espt_search.analyze_query()

    if settings.MORPHODICT_ENABLE_CVD:
        cvd_search_type = cast_away_optional(
            first_non_none_value(search_run.query.cvd, default=CvdSearchType.DEFAULT)
        )

        # For when you type 'cvd:exclusive' in a query to debug ONLY CVD results!
        if cvd_search_type == CvdSearchType.EXCLUSIVE:

            def sort_by_cvd(r: Result):
                return r.cosine_vector_distance

            search_run.sort_function = sort_by_cvd
            do_cvd_search(search_run)
            return search_run

    fetch_results(search_run)

    if (
        settings.MORPHODICT_ENABLE_AFFIX_SEARCH
        and include_affixes
        and not query_would_return_too_many_results(search_run.internal_query)
    ):
        do_source_language_affix_search(search_run)
        do_target_language_affix_search(search_run)

    if settings.MORPHODICT_ENABLE_CVD:
        if cvd_search_type.should_do_search() and not is_almost_certainly_cree(search_run):
            do_cvd_search(search_run)

    if search_run.query.espt:
        espt_search.inflect_search_results()

    return search_run
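# A hypothetical usage sketch for the search() entry point above. The query
# strings are assumptions for illustration; unsorted_results(), wordform.text,
# and cosine_vector_distance are the only accessors taken from the surrounding
# code, and the exact result-reading pattern is not guaranteed to match the
# project's views.
def sketch_search_usage():
    # An ordinary target-language query: keyword results first, then affix
    # and CVD results, depending on the MORPHODICT_ENABLE_* settings.
    run = search(query="little bear")

    # 'cvd:exclusive' skips every other step and sorts by cosine vector
    # distance, which is useful for debugging CVD results on their own.
    cvd_only = search(query="cvd:exclusive little bear")

    for result in cvd_only.unsorted_results():
        print(result.wordform.text, result.cosine_vector_distance)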
def find_pos_matches(search_run: SearchRun) -> None:
    analyzed_query = AnalyzedQuery(search_run.internal_query)
    # print(search_run.verbose_messages["new_tags"])

    if len(search_run.verbose_messages) <= 1:
        return
    tags = search_run.verbose_messages[1].get("tags")

    for result in search_run.unsorted_results():
        pos_match(result, tags)
def is_almost_certainly_cree(search_run: SearchRun) -> bool:
    """
    Heuristics intended to AVOID doing an English search.
    """
    query = search_run.query

    # If there is a word with two or more dashes in it, it's probably Cree:
    if any(term.count("-") >= 2 for term in query.query_terms):
        search_run.add_verbose_message("Skipping CVD because query has too many hyphens")
        return True

    if CREE_LONG_VOWEL.search(query.query_string):
        search_run.add_verbose_message("Skipping CVD because query has Cree diacritics")
        return True

    return False
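# CREE_LONG_VOWEL is defined elsewhere in the project; a plausible stand-in,
# assuming it matches the circumflex or macron long vowels used in Cree SRO
# orthography, might look like the pattern below. This is an assumption for
# illustration, not the project's actual regular expression.
import re

CREE_LONG_VOWEL = re.compile(r"[êîôâēīōā]")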
def test_espt_search(db, search, params):
    search_run = SearchRun(search)
    espt_search = EsptSearch(search_run)
    espt_search.analyze_query()
    assert search_run.query.query_terms == params["expected_query_terms"]
    assert search_run.query.query_string == " ".join(params["expected_query_terms"])
    assert espt_search.new_tags == params["expected_new_tags"]

    lemma1 = Wordform.objects.get(slug=params["slug"], is_lemma=True)
    search_run.add_result(
        Result(
            wordform=lemma1,
            target_language_keyword_match=params["expected_query_terms"],
        )
    )
    espt_search.inflect_search_results()

    assert params["expected_inflection"] in [
        entry.wordform.text for entry in search_run.unsorted_results()
    ]