def test_search_for_stored_non_lemma():
    """
    A "stored non-lemma" is a wordform in the database that is NOT a lemma.
    """
    # "S/he would tell us stories."
    lemma_str = "âcimêw"
    query = "ê-kî-âcimikoyâhk"
    search_results = Wordform.search(query)

    assert len(search_results) >= 1

    exact_matches = [
        result for result in search_results if result.matched_cree == query
    ]
    assert len(exact_matches) >= 1

    # Let's look at that search result in more detail
    result = exact_matches[0]
    assert not result.is_lemma
    assert result.lemma_wordform.text == lemma_str
    # TODO: tags are not implemented
    # assert not result.preverbs
    # assert not result.reduplication_tags
    # assert not result.initial_change_tags
    assert len(result.lemma_wordform.definitions.all()) >= 1
    assert all(
        len(dfn.source_ids) >= 1
        for dfn in result.lemma_wordform.definitions.all()
    )
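
# Note on the data model exercised above: as the importer later in this
# file set shows, every Wordform row carries an `is_lemma` flag and a
# `lemma` foreign key (a lemma points at itself). A stored non-lemma
# therefore resolves to its definitions through its lemma wordform.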

def test_search_for_exact_lemma(lemma: Wordform):
    """
    Check that we get a search result that matches the exact query.
    """
    assert lemma.is_lemma
    lemma_from_analysis, _, _ = lemma.analysis.partition("+")
    assert all(c == c.lower() for c in lemma_from_analysis)
    assume(lemma.text == lemma_from_analysis)

    query = lemma.text
    search_results = Wordform.search(query)

    exact_matches = {
        result
        for result in search_results
        if result.is_lemma and result.lemma_wordform == lemma
    }
    assert len(exact_matches) == 1

    # Let's look at that search result in more detail
    exact_match = exact_matches.pop()
    assert exact_match.matched_cree == lemma.text
    assert not exact_match.preverbs
    assert not exact_match.reduplication_tags
    assert not exact_match.initial_change_tags
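
# A minimal sketch of how a test like the one above can be driven by
# property-based testing with Hypothesis: assume() discards generated
# examples that fail a precondition instead of failing the test. The
# strategy name `random_lemmas()` is hypothetical; the real fixture or
# strategy is defined elsewhere in the test suite.
#
#     from hypothesis import assume, given
#
#     @given(lemma=random_lemmas())  # hypothetical strategy yielding lemma Wordforms
#     def test_search_for_exact_lemma(lemma: Wordform):
#         assume(lemma.text == lemma.analysis.partition("+")[0])
#         ...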

def test_lemma_and_syncretic_form_ranking(lemma):
    """
    Tests that the lemma is always shown first, even when a search yields
    one or more forms that are syncretic with the lemma; that is, ensure
    THIS doesn't happen:

        sheep [Plural]
          form of sheep [Singular]

        (no definition found for sheep [Plural])

        sheep [Singular]
        1. a fluffy mammal that appears in dreams

    Note: this test is likely to be **FLAKY** if the implementation is
    buggy and uses a **non-stable** sort or comparison.
    """
    results = Wordform.search(lemma)
    assert len(results) >= 2
    maskwa_results = [
        res for res in results if res.lemma_wordform.text == lemma
    ]
    assert len(maskwa_results) >= 2
    assert any(res.is_lemma for res in maskwa_results)
    first_result = maskwa_results[0]
    assert first_result.is_lemma, f"unexpected first result: {first_result}"
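
# The docstring above warns about non-stable sorts. Python's sorted() is
# guaranteed stable, so one deterministic way to put lemmas first is a
# boolean sort key: ties keep their original relative order. A sketch,
# not the project's actual ranking code:
def _rank_lemma_first_sketch(results):
    # False sorts before True, so results with is_lemma=True come first.
    return sorted(results, key=lambda res: not res.is_lemma)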

def index(request):  # pragma: no cover
    """
    Homepage with optional initial search results to display.

    :param request: the HTTP request; the optional "q" query parameter
        holds an initial query to search and display
    """
    user_query = request.GET.get("q", None)

    if user_query:
        search_results = [
            search_result.serialize()
            for search_result in Wordform.search(user_query)
        ]
        did_search = True
    else:
        search_results = []
        did_search = False

    context = {
        "word_search_form": WordSearchForm(),
        # when we have an initial query word to search and display
        "query_string": user_query,
        "search_results": search_results,
        "did_search": did_search,
    }
    return HttpResponse(render(request, "CreeDictionary/index.html", context))

def test_search_for_pronoun() -> None:
    """
    Search for a common pronoun, "ôma". Make sure the query "oma" returns
    at least one result that says "ôma".
    """
    search_results = Wordform.search("oma")
    assert "ôma" in {res.matched_cree for res in search_results}

def test_search_for_english() -> None:
    """
    Search for a word that is definitely in English.
    """
    # This should match "âcimowin" and related words:
    search_results = Wordform.search("story")
    assert search_results[0].matched_by == Language.ENGLISH

def test_search_words_with_preverbs():
    """
    Preverbs should be extracted and present in SearchResult instances.
    """
    results = Wordform.search("nitawi-nipâw")
    assert len(results) == 1
    search_result = results.pop()

    assert len(search_result.preverbs) == 1
    assert search_result.preverbs[0].text == "nitawi-"

def test_when_linguistic_breakdown_absent():
    # pê- is a preverb: it's not analyzable by the FST,
    # and so it should not have a linguistic breakdown
    query = "pe-"
    search_results = Wordform.search(query)
    assert len(search_results) == 1

    result = search_results[0]
    assert result.linguistic_breakdown_head == ()
    assert result.linguistic_breakdown_tail == ()

def test_search_text_with_ambiguous_word_classes():
    """
    Results of all word classes should be searched when the query is ambiguous.
    """
    # "pipon" can be viewed as a verb as well as a noun
    results = Wordform.search("pipon")
    assert {
        r.lemma_wordform.pos for r in results if r.matched_cree == "pipon"
    } == {"N", "V"}

def search_results(request, query_string: str):  # pragma: no cover
    """
    Returns rendered boxes of search results according to the user query.
    """
    results = Wordform.search(query_string)
    return render(
        request,
        "CreeDictionary/word-entries.html",
        {
            "query_string": query_string,
            "search_results": [r.serialize() for r in results],
        },
    )
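
# A sketch of how the two views above might be wired up in a Django
# urls.py; the route paths and names here are assumptions, not taken
# from this code:
from django.urls import path

urlpatterns = [
    path("", index, name="index"),
    path("search/<str:query_string>/", search_results, name="search-results"),
]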

def test_search_serialization_json_parsable(query):
    """
    Test that SearchResult.serialize() produces JSON-serializable results.
    """
    results = Wordform.search(query)
    for result in results:
        serialized = result.serialize()
        try:
            json.dumps(serialized)
        except Exception as e:
            print(e)
            pytest.fail(
                "SearchResult.serialize() failed to produce JSON-serializable output"
            )
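
# The `query` argument to the test above is injected by the test harness.
# A self-contained way to drive the same check with pytest (the sample
# queries are illustrative, borrowed from other tests in this suite):
import pytest

@pytest.mark.parametrize("query", ["nipâw", "story", "oma"])
def test_serialize_is_json_safe_sketch(query):
    for result in Wordform.search(query):
        json.dumps(result.serialize())  # raises TypeError if not JSON-safe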

def test_lemma_ranking_most_frequent_word():
    # The English word "sleep" should match many Cree words, but nipâw
    # should show first because it undoubtedly has the highest frequency.
    results = Wordform.search("sleep")
    assert results[0].matched_cree == "nipâw"
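
# A sketch of the kind of ranking the test above asserts, assuming a
# hypothetical mapping from wordform text to corpus frequency (the
# project's real ranking logic lives elsewhere):
def _rank_by_frequency_sketch(results, frequency):
    # `frequency` is a hypothetical dict: wordform text -> corpus count.
    # sorted() is stable, so equally frequent results keep their order.
    return sorted(
        results, key=lambda res: frequency.get(res.matched_cree, 0), reverse=True
    )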

def import_xmls(dir_name: Path, multi_processing: int = 1, verbose=True):
    r"""
    Import from crkeng files; `dir_name` can host a series of XML files.
    The latest timestamped file will be used, with un-timestamped files
    as a fallback.

    :param dir_name: the directory containing files matching the pattern
        crkeng.*?(?P<timestamp>\d{6})?\.xml
        (e.g. crkeng_cw_md_200319.xml or crkeng.xml); note the timestamp
        has the format yymmdd
    :param multi_processing: use multiple hfstol processes to speed up importing
    :param verbose: whether to print info to stdout
    """
    logger.set_print_info_on_console(verbose)

    crkeng_file_path = find_latest_xml_file(dir_name)
    logger.info(f"using crkeng file: {crkeng_file_path}")
    assert crkeng_file_path.exists()

    with open(crkeng_file_path) as f:
        crkeng_xml = IndexedXML.from_xml_file(f)

    source_abbreviations = crkeng_xml.source_abbreviations
    logger.info("Sources parsed: %r", source_abbreviations)
    for source_abbreviation in source_abbreviations:
        src = DictionarySource(abbrv=source_abbreviation)
        src.save()
        logger.info("Created source: %s", source_abbreviation)

    # these two will be imported to the database
    (
        identified_entry_to_analysis,
        as_is_entries,
    ) = xml_entry_lemma_finder.identify_entries(crkeng_xml, multi_processing)

    logger.info("Structuring wordforms, English keywords, and definition objects...")

    wordform_counter = 1
    definition_counter = 1
    keyword_counter = 1

    def generate_english_keywords(
        wordform: Wordform, translation: XMLTranslation
    ) -> List[EnglishKeyword]:
        """
        MUTATES keyword_counter!

        Returns a list of EnglishKeyword instances parsed from the
        translation text.
        """
        nonlocal keyword_counter

        keywords = [
            EnglishKeyword(id=unique_id, text=english_keyword, lemma=wordform)
            for unique_id, english_keyword in enumerate(
                stem_keywords(translation.text), start=keyword_counter
            )
        ]
        keyword_counter += len(keywords)
        return keywords

    db_inflections: List[Wordform] = []
    db_definitions: List[Definition] = []
    db_keywords: List[EnglishKeyword] = []
    citations: Dict[int, Set[str]] = {}

    # First, import the as-is entries to the database: the entries for
    # which we failed to find a lemma analysis.
    for entry in as_is_entries:
        upper_pos = entry.pos.upper()

        wordform_dict = dict(
            id=wordform_counter,
            text=entry.l,
            analysis=generate_as_is_analysis(entry.l, entry.pos, entry.ic),
            pos=upper_pos if upper_pos in RECOGNIZABLE_POS else "",
            inflectional_category=entry.ic,
            is_lemma=True,  # the is_lemma field should be true for as-is entries
            as_is=True,
        )
        if entry.stem is not None:
            wordform_dict["stem"] = entry.stem

        db_wordform = Wordform(**wordform_dict)

        # Insert keywords for as-is entries
        for translation in entry.translations:
            db_keywords.extend(generate_english_keywords(db_wordform, translation))

        db_wordform.lemma = db_wordform
        wordform_counter += 1
        db_inflections.append(db_wordform)

        for str_definition, source_strings in entry.translations:
            db_definition = Definition(
                id=definition_counter, text=str_definition, wordform=db_wordform
            )
            # Figure out what citations we should be making.
            assert definition_counter not in citations
            citations[definition_counter] = set(source_strings)

            definition_counter += 1
            db_definitions.append(db_definition)

    # generate ALL inflections within the paradigm tables from the lemma analyses
    expanded = expand_inflections(
        identified_entry_to_analysis.values(), multi_processing
    )

    # Now import the identified entries to the database: the entries we
    # successfully paired with their lemma analyses.
    for (entry, lemma_analysis) in identified_entry_to_analysis.items():
        lemma_text_and_word_class = (
            fst_analysis_parser.extract_lemma_text_and_word_class(lemma_analysis)
        )
        assert lemma_text_and_word_class is not None

        fst_lemma_text, word_class = lemma_text_and_word_class
        generated_pos = word_class.pos

        db_wordforms_for_analysis = []
        db_lemma = None

        # build wordforms and definitions in the database
        for generated_analysis, generated_wordform_texts in expanded[lemma_analysis]:

            generated_lemma_text_and_ic = (
                fst_analysis_parser.extract_lemma_text_and_word_class(
                    generated_analysis
                )
            )
            assert generated_lemma_text_and_ic is not None
            generated_lemma_text, generated_ic = generated_lemma_text_and_ic

            # generated_wordform_texts contains different spellings of one FST analysis
            for generated_wordform_text in generated_wordform_texts:
                is_lemma = (
                    generated_wordform_text == fst_lemma_text
                    and generated_analysis == lemma_analysis
                )

                wordform_dict = dict(
                    id=wordform_counter,
                    text=generated_wordform_text,
                    analysis=generated_analysis,
                    is_lemma=is_lemma,
                    pos=generated_pos.name,
                    inflectional_category=entry.ic,
                    as_is=False,
                )
                if entry.stem is not None:
                    wordform_dict["stem"] = entry.stem

                db_wordform = Wordform(**wordform_dict)

                db_wordforms_for_analysis.append(db_wordform)
                wordform_counter += 1
                db_inflections.append(db_wordform)

                if is_lemma:
                    db_lemma = db_wordform

                # Now create definitions for all (possibly non-lemma) entries
                # in the XML that are forms of this lemma: try to match our
                # generated wordform to entries in the XML file, in order to
                # get its translations from those entries.
                entries_with_translations: List[XMLEntry] = []

                # first, get homographic entries from the XML file
                homographic_entries = crkeng_xml.filter(l=generated_wordform_text)

                # If we do have homographic entries in the XML, check whether
                # those entries' pos and ic agree with our generated wordform.
                if len(homographic_entries) > 0:
                    for homographic_entry in homographic_entries:
                        if does_inflectional_category_match_xml_entry(
                            generated_ic, homographic_entry.pos, homographic_entry.ic
                        ):
                            entries_with_translations.append(homographic_entry)
                # If there are no homographic entries in the XML, the
                # generated inflection simply gets no definition.

                for entry_with_translation in entries_with_translations:
                    for translation in entry_with_translation.translations:
                        db_definition = Definition(
                            id=definition_counter,
                            text=translation.text,
                            wordform=db_wordform,
                        )
                        assert definition_counter not in citations
                        citations[definition_counter] = set(translation.sources)

                        definition_counter += 1
                        db_definitions.append(db_definition)
                        db_keywords.extend(
                            generate_english_keywords(db_wordform, translation)
                        )

        assert db_lemma is not None
        for wordform in db_wordforms_for_analysis:
            wordform.lemma = db_lemma

    logger.info("Inserting %d inflections to database..." % len(db_inflections))
    Wordform.objects.bulk_create(db_inflections)
    logger.info("Done inserting.")

    logger.info("Inserting definitions to database...")
    Definition.objects.bulk_create(db_definitions)
    logger.info("Done inserting.")

    logger.info("Inserting citations [definition -> dictionary source] to database...")

    # ThroughModel is the "hidden" model that manages the many-to-many
    # relationship
    ThroughModel = Definition.citations.through

    def _generate_through_models():
        "Yields all associations between Definitions and DictionarySources"
        for dfn_id, src_ids in citations.items():
            for src_pk in src_ids:
                yield ThroughModel(definition_id=dfn_id, dictionarysource_id=src_pk)

    ThroughModel.objects.bulk_create(_generate_through_models())
    logger.info("Done inserting.")

    logger.info("Inserting English keywords to database...")
    EnglishKeyword.objects.bulk_create(db_keywords)
    logger.info("Done inserting.")

    # Convert the sources (stored as strings) to citations.
    # Why this isn't done in the first place: Django's efficient
    # bulk_create() method, used above, doesn't play well with ManyToManyField.
    for dfn in Definition.objects.all():
        source_ids = sorted(source.abbrv for source in dfn.citations.all())
        for source_id in source_ids:
            dfn.citations.add(source_id)
        dfn.save()
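
# Example invocation of the importer above (a sketch: in the real project
# this is presumably triggered from a Django management command, and the
# directory path here is an assumption):
if __name__ == "__main__":
    import_xmls(Path("res/dictionaries"), multi_processing=4)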