def get_variants(word):
    """Return a list of the spellings under which `word` may be written.

    The list starts with the lower-cased word itself, followed by its
    inflected forms (verb tenses for infinitives; plural and/or
    accusative endings for declinable adjectives, nouns, adverbs and
    pronouns), followed by the x-system and h-system respellings of any
    of those forms whose respelling differs from the form itself.

    Everything is generated in lower case, so that a search over the
    variants can be case insensitive.
    """
    base = word.lower()

    # the word itself always counts as a variant
    forms = [base]

    # Inflections. Only the first matching word class applies, and the
    # predicates are consulted in this order: infinitive, adjective,
    # noun, adverb, pronoun.
    if is_infinitive(base):
        stem = base[:-1]
        forms += [stem + ending for ending in ('is', 'as', 'os', 'us', 'u')]
    elif is_declinable_adjective(base) or is_declinable_noun(base):
        forms += [base + ending for ending in ('j', 'n', 'jn')]
    elif is_declinable_adverb(base) or is_pronoun(base):
        forms.append(base + 'n')

    # Respellings in the other writing systems, kept only when they
    # actually differ. They are collected separately so that the list
    # being iterated over is not modified mid-loop.
    respelled = []
    for form in forms:
        for convert in (to_x_system, to_h_system):
            alternative = convert(form)
            if alternative != form:
                respelled.append(alternative)

    forms.extend(respelled)
    return forms
def _save_examples(definition_obj, examples):
    """Save one Example per (example, source) pair, attached to definition_obj."""
    for (example, source) in examples:
        Example(definition=definition_obj, example=example,
                source=source).save()


def _save_translations(word_obj, definition_obj, translations_by_language):
    """Save one Translation per translation, tagged with its language code.

    translations_by_language maps a language code to an iterable of
    translations in that language.
    """
    for (language_code, translations) in translations_by_language.items():
        for translation in translations:
            Translation(word=word_obj, definition=definition_obj,
                        translation=translation,
                        language_code=language_code).save()


def _add_morphemes(word_obj, text, seen_morphemes):
    """Create a Morpheme pointing at word_obj for each unseen spelling of text.

    Spellings already in the set seen_morphemes are skipped; new ones are
    added to it, so no morpheme spelling is ever stored twice.
    """
    for spelling in get_all_spellings(text):
        if spelling not in seen_morphemes:
            seen_morphemes.add(spelling)
            Morpheme.objects.create(primary_word=word_obj, morpheme=spelling)


def populate_database(dictionary):
    """Given a dictionary from a JSON dump created by ReVo-utilities,
    write its contents to the database.

    `dictionary` maps each word to an entry with the keys 'definitions',
    'primary' and 'root'. Each definition has the keys
    'primary definition', 'subdefinitions', 'examples', 'remarks' and
    'translations'; each subdefinition has 'primary definition',
    'examples' and 'translations'. Examples are (example, source) pairs
    and translations map a language code to a list of translations.

    For every word this saves the Word, its Variants, its definitions
    and subdefinitions (with their examples, remarks and translations)
    and, for primary words, its Morphemes. Finally the participle
    morphemes, which are not in ReVo, are saved.

    NOTE(review): this is meant to commit only once, because committing
    every object separately would take hours. No transaction handling is
    visible in this function, so that presumably has to be arranged by
    the caller or by a decorator -- confirm.
    """
    # every morpheme spelling stored so far; no duplicate morphemes
    seen_morphemes = set()

    for (word, entry) in dictionary.items():
        word_obj = Word(word=word)
        word_obj.save()

        # variants (case/declension/tense)
        for variant in get_variants(word):
            Variant(word=word_obj, variant=variant).save()

        # Add every definition. Saving them in this order means that the
        # order of definition_id corresponds to the order of the
        # definitions from ReVo, which is important.
        for definition_dict in entry['definitions']:
            definition = definition_dict['primary definition']
            definition_obj = PrimaryDefinition(definition=definition,
                                               word=word_obj)
            definition_obj.save()

            # subdefinitions belonging to this definition, each with
            # its own examples and translations
            for subdefinition_dict in definition_dict['subdefinitions']:
                subdefinition = subdefinition_dict['primary definition']
                subdefinition_obj = Subdefinition(
                    definition=subdefinition,
                    root_definition=definition_obj)
                subdefinition_obj.save()

                _save_examples(subdefinition_obj,
                               subdefinition_dict['examples'])
                _save_translations(word_obj, subdefinition_obj,
                                   subdefinition_dict['translations'])

            # examples belonging to this definition
            _save_examples(definition_obj, definition_dict['examples'])

            # remarks belonging to this definition
            for remark in definition_dict['remarks']:
                Remark(definition=definition_obj, remark=remark).save()

            # words in other languages which have the same meaning
            _save_translations(word_obj, definition_obj,
                               definition_dict['translations'])

        # Add morphemes to initial data.
        #
        # Primary means we will link to this word when we find the
        # morpheme. For example, we link 'dorm' to 'dormi' although
        # 'dormo' is also in the dictionary.
        if entry['primary']:
            # Add roots (e.g. 'dorm'), forbidding those of one letter
            # since none actually exist in word building.
            root = entry['root']
            if len(root) > 1:
                _add_morphemes(word_obj, root, seen_morphemes)

            # Also add the word itself as a morpheme if it is a
            # declinable noun, adjective or adverb.
            # NOTE(review): assumed to apply to primary words only, like
            # the root step above -- confirm the intended nesting.
            if (is_declinable_noun(word)
                    or is_declinable_adjective(word)
                    or is_declinable_adverb(word)):
                _add_morphemes(word_obj, word, seen_morphemes)

    # Add -ant, etc morphemes which aren't in ReVo. The asserts are
    # sanity checks that the dictionary did not already supply them.
    assert 'ant' not in seen_morphemes
    for morpheme in ['int', 'ant', 'ont', 'unt']:
        Morpheme(morpheme=morpheme).save()

    assert 'at' not in seen_morphemes
    for morpheme in ['it', 'at', 'ot']:
        Morpheme(morpheme=morpheme).save()