def do_cvd_search(search_run: SearchRun):
    """Use cosine vector distance to add results to the search run.

    Keywords from the query string are turned into vectors from Google News,
    added together, and then compared against pre-computed definition vectors.
    """
    keys = extract_keyed_words(search_run.query.query_string,
                               google_news_vectors())
    if not keys:
        return

    search_run.add_verbose_message(cvd_extracted_keys=keys)
    query_vector = vector_for_keys(google_news_vectors(), keys)

    try:
        closest = definition_vectors().similar_by_vector(query_vector, 50)
    except DefinitionVectorsNotFoundException:
        logger.exception("Definition vectors unavailable; skipping CVD search")
        return

    # Each entry of `closest` is a (cvd_key, similarity) pair.
    wordform_queries = [
        cvd_key_to_wordform_query(cvd_key) for cvd_key, _similarity in closest
    ]
    similarities = [similarity for _cvd_key, similarity in closest]

    # Get all possible wordforms in one big query. We will select more than we
    # need, then filter it down later, but this will have to do until we get
    # better homonym handling.
    wordform_results = Wordform.objects.filter(text__in=set(
        wf["text"] for wf in wordform_queries))

    # Group wordforms by their text. NOTE: itertools.groupby only groups
    # *consecutive* items, and the queryset above is not guaranteed to be
    # ordered by text, so grouping with a plain dict avoids silently split
    # groups (and therefore missed matches).
    wordforms_by_text: dict = {}
    for wordform in wordform_results:
        wordforms_by_text.setdefault(wordform.text, []).append(wordform)

    for similarity, wordform_query in zip(similarities, wordform_queries):
        # gensim uses the terminology, similarity = 1 - distance. Its
        # similarity is a number from 0 to 1, with more similar items having
        # similarity closer to 1. A distance should be small for things that
        # are close together.
        distance = 1 - similarity

        wordforms_for_query = wordforms_by_text.get(wordform_query["text"])
        if wordforms_for_query is None:
            logger.warning(
                f"Wordform {wordform_query['text']} not found in CVD; mismatch between definition vector model file and definitions in database?"
            )
        else:
            for wf in wordforms_for_query:
                if wordform_query_matches(wordform_query, wf):
                    search_run.add_result(
                        Result(wf, cosine_vector_distance=distance))
def test_espt_search_doesnt_crash_when_no_analysis(db):
    """Regression test: ESPT inflection must tolerate results lacking an analysis."""
    run = SearchRun("my little bears")
    espt = EsptSearch(run)
    espt.analyze_query()

    # A bare wordform that is its own lemma and carries no analysis.
    wf = Wordform(text="pê-")
    wf.lemma = wf
    wf.is_lemma = True
    run.add_result(
        Result(wordform=wf, target_language_keyword_match=["bear"]))

    # This will crash if the espt code doesn’t handle results without an analysis
    espt.inflect_search_results()
# Example #3
def search(*,
           query: str,
           include_affixes=True,
           include_auto_definitions=False) -> SearchRun:
    """
    Perform an actual search, using the provided options.

    This class encapsulates the logic of which search methods to try, and in
    which order, to build up results in a SearchRun.
    """
    search_run = SearchRun(query=query,
                           include_auto_definitions=include_auto_definitions)

    if search_run.query.espt:
        espt_search = EsptSearch(search_run)
        espt_search.analyze_query()

    if settings.MORPHODICT_ENABLE_CVD:
        cvd_search_type = cast_away_optional(
            first_non_none_value(search_run.query.cvd,
                                 default=CvdSearchType.DEFAULT))

        # For when you type 'cvd:exclusive' in a query to debug ONLY CVD results!
        if cvd_search_type == CvdSearchType.EXCLUSIVE:
            search_run.sort_function = lambda r: r.cosine_vector_distance
            do_cvd_search(search_run)
            return search_run

    fetch_results(search_run)

    affixes_wanted = (settings.MORPHODICT_ENABLE_AFFIX_SEARCH
                      and include_affixes
                      and not query_would_return_too_many_results(
                          search_run.internal_query))
    if affixes_wanted:
        do_source_language_affix_search(search_run)
        do_target_language_affix_search(search_run)

    if settings.MORPHODICT_ENABLE_CVD:
        # cvd_search_type was computed above under the same setting guard.
        if (cvd_search_type.should_do_search()
                and not is_almost_certainly_cree(search_run)):
            do_cvd_search(search_run)

    if search_run.query.espt:
        espt_search.inflect_search_results()

    return search_run
# Example #4
def find_pos_matches(search_run: SearchRun) -> None:
    """Apply ``pos_match`` to every unsorted result for its side effects.

    Reads the "tags" entry from the second verbose message, if one exists;
    does nothing when fewer than two verbose messages are recorded.
    """
    # NOTE(review): this value was never used by the original code; the
    # constructor call is kept in case it has side effects — TODO confirm
    # and delete.
    AnalyzedQuery(search_run.internal_query)

    if len(search_run.verbose_messages) <= 1:
        return
    tags = search_run.verbose_messages[1].get("tags")
    # Plain loop instead of a throwaway list comprehension: this runs
    # purely for pos_match's side effects.
    for result in search_run.unsorted_results():
        pos_match(result, tags)
# Example #5
def is_almost_certainly_cree(search_run: SearchRun) -> bool:
    """
    Heuristics intended to AVOID doing an English search.
    """
    query = search_run.query

    # A term with two or more dashes strongly suggests Cree morphology.
    for term in query.query_terms:
        if term.count("-") >= 2:
            search_run.add_verbose_message(
                "Skipping CVD because query has too many hyphens"
            )
            return True

    # Long-vowel diacritics are a Cree-orthography giveaway.
    if CREE_LONG_VOWEL.search(query.query_string) is not None:
        search_run.add_verbose_message("Skipping CVD because query has Cree diacritics")
        return True

    return False
def test_espt_search(db, search, params):
    """End-to-end check of ESPT query analysis and result inflection."""
    run = SearchRun(search)
    espt = EsptSearch(run)
    espt.analyze_query()

    expected_terms = params["expected_query_terms"]
    assert run.query.query_terms == expected_terms
    assert run.query.query_string == " ".join(expected_terms)
    assert espt.new_tags == params["expected_new_tags"]

    lemma = Wordform.objects.get(slug=params["slug"], is_lemma=True)

    run.add_result(
        Result(
            wordform=lemma,
            target_language_keyword_match=expected_terms,
        ))

    espt.inflect_search_results()

    result_texts = [entry.wordform.text for entry in list(run.unsorted_results())]
    assert params["expected_inflection"] in result_texts