Example #1
def test_search_for_stored_non_lemma():
    """
    A "stored non-lemma" is a wordform in the database that is NOT a lemma.
    """
    # "S/he would tell us stories."
    lemma_str = "âcimêw"
    query = "ê-kî-âcimikoyâhk"
    search_results = Wordform.search(query)

    assert len(search_results) >= 1

    exact_matches = [
        result for result in search_results if result.matched_cree == query
    ]
    assert len(exact_matches) >= 1

    # Let's look at that search result in more detail
    result = exact_matches[0]
    assert not result.is_lemma
    assert result.lemma_wordform.text == lemma_str
    # TODO: tags are not implemented
    # assert not result.preverbs
    # assert not result.reduplication_tags
    # assert not result.initial_change_tags
    assert len(result.lemma_wordform.definitions.all()) >= 1
    assert all(
        len(dfn.source_ids) >= 1
        for dfn in result.lemma_wordform.definitions.all())
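For orientation, a minimal sketch of the self-referential Wordform model the tests in this file assume, reconstructed from the fields exercised here and in import_xmls below; the max_length values and related_name are illustrative assumptions, and the real model has more fields:

from django.db import models

class Wordform(models.Model):
    text = models.CharField(max_length=40)
    analysis = models.CharField(max_length=100)
    is_lemma = models.BooleanField(default=False)
    # A non-lemma wordform points at its lemma; a lemma points at itself
    # (import_xmls below sets db_wordform.lemma = db_wordform for lemmas).
    lemma = models.ForeignKey(
        "self", on_delete=models.CASCADE, related_name="inflections"
    )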
Example #2
def test_search_for_exact_lemma(lemma: Wordform):
    """
    Check that we get a search result that matches the exact query.
    """

    assert lemma.is_lemma
    lemma_from_analysis, _, _ = lemma.analysis.partition("+")
    assert all(c == c.lower() for c in lemma_from_analysis)
    assume(lemma.text == lemma_from_analysis)

    query = lemma.text
    search_results = Wordform.search(query)

    exact_matches = {
        result
        for result in search_results
        if result.is_lemma and result.lemma_wordform == lemma
    }
    assert len(exact_matches) == 1

    # Let's look at that search result in more detail
    exact_match = exact_matches.pop()
    assert exact_match.matched_cree == lemma.text
    assert not exact_match.preverbs
    assert not exact_match.reduplication_tags
    assert not exact_match.initial_change_tags
Example #3
def test_lemma_and_syncretic_form_ranking(lemma):
    """
    Tests that the lemma is always shown first, even when a search yields
    one or more forms that are syncretic with the lemma; that is, ensure THIS
    doesn't happen:

        sheep [Plural]
        form of sheep [Singular]

        (no definition found for sheep [Plural])

        sheep [Singular]
        1. a fluffy mammal that appears in dreams

    Note: this test is likely to be **FLAKY** if the implementation is buggy
    and uses a **non-stable** sort or comparison.
    """

    results = Wordform.search(lemma)
    assert len(results) >= 2
    lemma_results = [
        res for res in results if res.lemma_wordform.text == lemma
    ]
    assert len(lemma_results) >= 2
    assert any(res.is_lemma for res in lemma_results)
    first_result = lemma_results[0]
    assert first_result.is_lemma, f"unexpected first result: {first_result}"
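A minimal sketch of the lemma-first ordering property the test above checks for. Python's sorted() is guaranteed stable, so syncretic forms keep their relative order while lemmas move to the front; this illustrates the property, and is not the project's ranking code:

def order_lemma_first(search_results):
    # False sorts before True, so results with is_lemma come first; the
    # stable sort preserves the existing relative order among equal keys.
    return sorted(search_results, key=lambda result: not result.is_lemma)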
Example #4
def index(request):  # pragma: no cover
    """
    Homepage, with optional initial search results to display.

    The initial query, if any, is read from the "q" GET parameter.
    """

    user_query = request.GET.get("q", None)

    if user_query:
        search_results = [
            search_result.serialize() for search_result in Wordform.search(user_query)
        ]
        did_search = True
    else:
        search_results = []
        did_search = False

    context = {
        "word_search_form": WordSearchForm(),
        # when we have initial query word to search and display
        "query_string": user_query,
        "search_results": search_results,
        "did_search": did_search,
    }
    return render(request, "CreeDictionary/index.html", context)
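A hedged sketch of exercising the view above with Django's RequestFactory; the query value is arbitrary:

from django.test import RequestFactory

request = RequestFactory().get("/", {"q": "atim"})
response = index(request)
assert response.status_code == 200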
Example #5
def test_search_for_pronoun() -> None:
    """
    Search for a common pronoun "ôma". Make sure "oma" returns at least one
    result that says "ôma"
    """

    search_results = Wordform.search("oma")
    assert "ôma" in {res.matched_cree for res in search_results}
Example #6
def test_search_for_english() -> None:
    """
    Search for a word that is definitely in English.
    """

    # This should match "âcimowin" and related words:
    search_results = Wordform.search("story")

    assert search_results[0].matched_by == Language.ENGLISH
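English queries are matched through stemmed keywords (see generate_english_keywords in import_xmls below). As an illustration of the idea, assuming stem_keywords behaves like a standard stemmer, NLTK's Snowball stemmer maps "story" and "stories" to the same stem:

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
# The query and the keyword reduce to the same stem, so "story" can match
# a definition containing "stories".
assert stemmer.stem("story") == stemmer.stem("stories") == "stori"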
Example #7
def test_search_words_with_preverbs():
    """
    preverbs should be extracted and present in SearchResult instances
    """
    results = Wordform.search("nitawi-nipâw")
    assert len(results) == 1
    search_result = results.pop()

    assert len(search_result.preverbs) == 1
    assert search_result.preverbs[0].text == "nitawi-"
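A hedged sketch of preverb extraction from an FST analysis string, assuming the analyser marks preverbs with leading "PV/…" tags; the exact tag format is an assumption:

def extract_preverb_tags(analysis: str) -> list:
    # Preverb tags precede the lemma in the "+"-separated analysis.
    return [part for part in analysis.split("+") if part.startswith("PV/")]

assert extract_preverb_tags("PV/nitawi+nipâw+V+AI+Ind+3Sg") == ["PV/nitawi"]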
Example #8
def test_when_linguistic_breakdown_absent():
    # pê- is a preverb; it is not analyzable by the FST, so it should have
    # no linguistic breakdown.

    query = "pe-"
    search_results = Wordform.search(query)

    assert len(search_results) == 1

    result = search_results[0]
    assert (result.linguistic_breakdown_head == ()
            and result.linguistic_breakdown_tail == ())
Example #9
def test_search_text_with_ambiguous_word_classes():
    """
    Results of all word classes should be searched when the query is ambiguous
    """
    # pipon can be viewed as a Verb as well as a Noun
    results = Wordform.search("pipon")
    assert {
        r.lemma_wordform.pos
        for r in results if r.matched_cree == "pipon"
    } == {
        "N",
        "V",
    }
Example #10
def search_results(request, query_string: str):  # pragma: no cover
    """
    Renders the search result boxes for the given user query.
    """
    results = Wordform.search(query_string)
    return render(
        request,
        "CreeDictionary/word-entries.html",
        {
            "query_string": query_string,
            "search_results": [r.serialize() for r in results],
        },
    )
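Hypothetical URL wiring for the two views above; the route strings and names are assumptions for illustration:

from django.urls import path

urlpatterns = [
    path("", index, name="index"),
    path(
        "_search_results/<str:query_string>/",
        search_results,
        name="search-results",
    ),
]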
Example #11
def test_search_serialization_json_parsable(query):
    """
    Test SearchResult.serialize produces json compatible results
    """
    results = Wordform.search(query)
    for result in results:

        serialized = result.serialize()
        try:
            json.dumps(serialized)
        except Exception as e:
            print(e)
            pytest.fail(
                "SearchResult.serialized method failed to be json compatible")
Example #12
def test_lemma_ranking_most_frequent_word():
    # The English word "sleep" should match many Cree words, but nipâw should
    # show first because it undoubtedly has the highest frequency.
    results = Wordform.search("sleep")
    assert results[0].matched_cree == "nipâw"
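A minimal sketch of frequency-first ranking consistent with the comment above, assuming a corpus-frequency lookup is available; the corpus_frequency mapping is hypothetical:

def order_by_frequency(search_results, corpus_frequency):
    # corpus_frequency: hypothetical mapping from wordform text to corpus
    # count; unknown wordforms get 0 and therefore rank last.
    return sorted(
        search_results,
        key=lambda result: corpus_frequency.get(result.matched_cree, 0),
        reverse=True,
    )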
Example #13
def import_xmls(dir_name: Path, multi_processing: int = 1, verbose=True):
    r"""
    Import from crkeng files, `dir_name` can host a series of xml files. The latest timestamped files will be
    used, with un-timestamped files as a fallback.

    :param multi_processing: Use multiple hfstol processes to speed up importing
    :param dir_name: the directory that has pattern crkeng.*?(?P<timestamp>\d{6})?\.xml
    (e.g. crkeng_cw_md_200319.xml or crkeng.xml) files, beware the timestamp has format yymmdd
    :param verbose: print to stdout or not
    """
    logger.set_print_info_on_console(verbose)

    crkeng_file_path = find_latest_xml_file(dir_name)
    logger.info(f"using crkeng file: {crkeng_file_path}")

    assert crkeng_file_path.exists()

    with open(crkeng_file_path) as f:
        crkeng_xml = IndexedXML.from_xml_file(f)

    source_abbreviations = crkeng_xml.source_abbreviations

    logger.info("Sources parsed: %r", source_abbreviations)
    for source_abbreviation in source_abbreviations:
        src = DictionarySource(abbrv=source_abbreviation)
        src.save()
        logger.info("Created source: %s", source_abbreviation)

    # These two will be imported into the database:
    (
        identified_entry_to_analysis,
        as_is_entries,
    ) = xml_entry_lemma_finder.identify_entries(crkeng_xml, multi_processing)

    logger.info("Structuring wordforms, english keywords, and definition objects...")

    wordform_counter = 1
    definition_counter = 1
    keyword_counter = 1

    def generate_english_keywords(
        wordform: Wordform, translation: XMLTranslation
    ) -> List[EnglishKeyword]:
        """
        MUTATES keyword_counter!!!!!!!!!

        Returns a list of EnglishKeyword instances parsed from the translation text.
        """
        nonlocal keyword_counter

        keywords = [
            EnglishKeyword(id=unique_id, text=english_keyword, lemma=wordform)
            for unique_id, english_keyword in enumerate(
                stem_keywords(translation.text), start=keyword_counter
            )
        ]
        keyword_counter += len(keywords)
        return keywords

    db_inflections: List[Wordform] = []
    db_definitions: List[Definition] = []
    db_keywords: List[EnglishKeyword] = []
    citations: Dict[int, Set[str]] = {}

    # Now import the as-is entries: those for which we failed to produce a
    # lemma analysis.
    for entry in as_is_entries:
        upper_pos = entry.pos.upper()
        wordform_dict = dict(
            id=wordform_counter,
            text=entry.l,
            analysis=generate_as_is_analysis(entry.l, entry.pos, entry.ic),
            pos=upper_pos if upper_pos in RECOGNIZABLE_POS else "",
            inflectional_category=entry.ic,
            is_lemma=True,  # is_lemma field should be true for as_is entries
            as_is=True,
        )
        if entry.stem is not None:
            wordform_dict["stem"] = entry.stem

        db_wordform = Wordform(**wordform_dict)

        # Insert keywords for as-is entries
        for translation in entry.translations:
            db_keywords.extend(generate_english_keywords(db_wordform, translation))

        db_wordform.lemma = db_wordform

        wordform_counter += 1
        db_inflections.append(db_wordform)

        for str_definition, source_strings in entry.translations:

            db_definition = Definition(
                id=definition_counter, text=str_definition, wordform=db_wordform
            )

            # Figure out what citations we should be making.
            assert definition_counter not in citations
            citations[definition_counter] = set(source_strings)

            definition_counter += 1
            db_definitions.append(db_definition)

    # generate ALL inflections within the paradigm tables from the lemma analysis
    expanded = expand_inflections(
        identified_entry_to_analysis.values(), multi_processing
    )
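    # `expanded` maps each lemma analysis to a list of
    # (generated_analysis, generated_wordform_texts) pairs; see the loop below.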

    # Now import the identified entries: those we successfully matched with
    # a lemma analysis.
    for (entry, lemma_analysis) in identified_entry_to_analysis.items():
        lemma_text_and_word_class = (
            fst_analysis_parser.extract_lemma_text_and_word_class(lemma_analysis)
        )
        assert lemma_text_and_word_class is not None

        fst_lemma_text, word_class = lemma_text_and_word_class
        generated_pos = word_class.pos

        db_wordforms_for_analysis = []
        db_lemma = None

        # build wordforms and definition in db
        for generated_analysis, generated_wordform_texts in expanded[lemma_analysis]:

            generated_lemma_text_and_ic = (
                fst_analysis_parser.extract_lemma_text_and_word_class(
                    generated_analysis
                )
            )
            assert generated_lemma_text_and_ic is not None
            generated_lemma_text, generated_ic = generated_lemma_text_and_ic

            for generated_wordform_text in generated_wordform_texts:
                # generated_wordform_texts holds the different spellings of
                # one FST analysis; the lemma is the spelling that matches
                # both the lemma text and the lemma analysis.
                is_lemma = (
                    generated_wordform_text == fst_lemma_text
                    and generated_analysis == lemma_analysis
                )
                wordform_dict = dict(
                    id=wordform_counter,
                    text=generated_wordform_text,
                    analysis=generated_analysis,
                    is_lemma=is_lemma,
                    pos=generated_pos.name,
                    inflectional_category=entry.ic,
                    as_is=False,
                )
                if entry.stem is not None:
                    wordform_dict["stem"] = entry.stem
                db_wordform = Wordform(**wordform_dict)

                db_wordforms_for_analysis.append(db_wordform)
                wordform_counter += 1
                db_inflections.append(db_wordform)

                if is_lemma:
                    db_lemma = db_wordform

                # Now create definitions for all (possibly non-lemma) entries in the XML that are forms of this lemma.

                # try to match our generated wordform to entries in the xml file,
                # in order to get its translation from the entries
                entries_with_translations: List[XMLEntry] = []

                # first get homographic entries from the xml file
                homographic_entries = crkeng_xml.filter(l=generated_wordform_text)

                # If there are homographic entries in the XML, check whether
                # their pos and ic agree with our generated wordform.
                for homographic_entry in homographic_entries:
                    if does_inflectional_category_match_xml_entry(
                        generated_ic, homographic_entry.pos, homographic_entry.ic
                    ):
                        entries_with_translations.append(homographic_entry)

                # When there are no homographic entries in the XML, the
                # generated inflection simply gets no definition.

                for entry_with_translation in entries_with_translations:

                    for translation in entry_with_translation.translations:
                        db_definition = Definition(
                            id=definition_counter,
                            text=translation.text,
                            wordform=db_wordform,
                        )
                        assert definition_counter not in citations
                        citations[definition_counter] = set(translation.sources)

                        definition_counter += 1
                        db_definitions.append(db_definition)

                        db_keywords.extend(
                            generate_english_keywords(db_wordform, translation)
                        )

        assert db_lemma is not None
        for wordform in db_wordforms_for_analysis:
            wordform.lemma = db_lemma

    logger.info("Inserting %d inflections to database..." % len(db_inflections))
    Wordform.objects.bulk_create(db_inflections)
    logger.info("Done inserting.")

    logger.info("Inserting definition to database...")
    Definition.objects.bulk_create(db_definitions)
    logger.info("Done inserting.")

    logger.info("Inserting citations [definition -> dictionary source] to database...")
    # ThroughModel is the "hidden" model that manages the Many-to-Many
    # relationship
    ThroughModel = Definition.citations.through
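    # Each ThroughModel row is one (definition_id, dictionarysource_id) pair;
    # bulk-creating these rows populates the many-to-many table directly.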

    def _generate_through_models():
        "Yields all associations between Definitions and DictionarySources"
        for dfn_id, src_ids in citations.items():
            for src_pk in src_ids:
                yield ThroughModel(definition_id=dfn_id, dictionarysource_id=src_pk)

    ThroughModel.objects.bulk_create(_generate_through_models())
    logger.info("Done inserting.")

    logger.info("Inserting English keywords to database...")
    EnglishKeyword.objects.bulk_create(db_keywords)
    logger.info("Done inserting.")

    # Convert the sources (stored as strings) to citations.
    # The reason this was not done in the first place is that Django's
    # efficient `bulk_create` (used above) doesn't play well with ManyToManyField.
    for dfn in Definition.objects.all():
        source_ids = sorted(source.abbrv for source in dfn.citations.all())
        for source_id in source_ids:
            dfn.citations.add(source_id)
        dfn.save()
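Finally, a hedged sketch of the file-selection rule that import_xmls's docstring describes; find_latest_xml_file's real implementation may differ:

import re
from pathlib import Path

def find_latest_xml_file_sketch(dir_name: Path) -> Path:
    """
    Pick the crkeng XML file with the newest yymmdd timestamp; fall back to
    an un-timestamped file when no timestamped file exists.
    """
    candidates = []
    for path in dir_name.glob("crkeng*.xml"):
        match = re.search(r"(?P<timestamp>\d{6})\.xml$", path.name)
        # Un-timestamped files get the empty string, which sorts before any
        # yymmdd timestamp; yymmdd strings sort chronologically.
        candidates.append((match.group("timestamp") if match else "", path))
    if not candidates:
        raise FileNotFoundError(f"no crkeng XML file found in {dir_name}")
    return max(candidates)[1]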