Exemple #1
0
def get_variants(word):
    """Given a word, return a list of every spelling we generate for
    it: the word itself, its inflected forms, and each of those
    re-spelled in the x-system and h-system where that differs.

    We generate all our variants in lower case. Since we search over
    the variants, this gives us a case insensitive search.

    The returned list starts with the lower-cased word, followed by
    the inflected forms, followed by the alternative-writing-system
    spellings (for each form: x-system first, then h-system).

    """
    # the word itself is a variant
    base = word.lower()
    variants = [base]

    # every tense and declension; only the first matching word class
    # contributes endings
    if is_infinitive(base):
        # drop the final character (presumably the infinitive ending)
        # and attach each verb ending in its place
        root = base[:-1]
        variants.extend([root + ending
                         for ending in ('is', 'as', 'os', 'us', 'u')])
    elif is_declinable_adjective(base) or is_declinable_noun(base):
        variants.extend([base + ending for ending in ('j', 'n', 'jn')])
    elif is_declinable_adverb(base) or is_pronoun(base):
        variants.append(base + 'n')

    # add additional variants if they are different in other writing
    # systems; collected separately so we never grow the list we are
    # iterating over
    respelled = []
    for variant in variants:
        for convert in (to_x_system, to_h_system):
            # convert once and reuse the result for both the
            # comparison and the append
            converted = convert(variant)
            if converted != variant:
                respelled.append(converted)

    variants.extend(respelled)

    return variants
Exemple #2
0
def _save_examples(definition_obj, examples):
    """Save one Example row per (example, source) pair, attached to
    definition_obj.

    """
    for (example, source) in examples:
        Example(definition=definition_obj, example=example,
                source=source).save()


def _save_translations(word_obj, definition_obj, translations_by_language):
    """Save one Translation row for every translation of every
    language code in translations_by_language, attached to both
    word_obj and definition_obj.

    """
    for (language_code, translations) in translations_by_language.items():
        for translation in translations:
            Translation(word=word_obj, definition=definition_obj,
                        translation=translation,
                        language_code=language_code).save()


def _add_morphemes(word_obj, text, seen_morphemes):
    """Create a Morpheme pointing at word_obj for every spelling of
    text that is not yet in the set seen_morphemes, recording each new
    spelling in that set.

    """
    for spelling in get_all_spellings(text):
        if spelling not in seen_morphemes:
            seen_morphemes.add(spelling)
            Morpheme.objects.create(primary_word=word_obj, morpheme=spelling)


def populate_database(dictionary):
    """Given a dictionary file from a JSON dump created by
    ReVo-utilities, write its contents to the database.

    dictionary maps each word to an entry with the keys 'definitions',
    'primary' and 'root'. Each definition has the keys
    'primary definition', 'subdefinitions', 'examples', 'remarks' and
    'translations'; each subdefinition has 'primary definition',
    'examples' and 'translations'.

    The original author intended everything to be committed once,
    because committing every object separately would take hours.
    NOTE(review): no transaction handling is visible in this function,
    so that presumably comes from a decorator or the caller -- confirm.

    """

    # no duplicate morphemes: the first word to claim a spelling wins
    seen_morphemes = set()

    for (word, entry) in dictionary.items():

        word_obj = Word(word=word)
        word_obj.save()

        # variants (case/declension/tense)
        for variant in get_variants(word):
            Variant(word=word_obj, variant=variant).save()

        # add every definition
        # note this means that the order of definition_id corresponds
        # to the order of the definitions from ReVo, which is important
        for definition_dict in entry['definitions']:
            definition_obj = PrimaryDefinition(
                definition=definition_dict['primary definition'],
                word=word_obj)
            definition_obj.save()

            # subdefinitions belonging to this definition, each with
            # its own examples and translations
            for subdefinition_dict in definition_dict['subdefinitions']:
                subdefinition_obj = Subdefinition(
                    definition=subdefinition_dict['primary definition'],
                    root_definition=definition_obj)
                subdefinition_obj.save()

                _save_examples(subdefinition_obj,
                               subdefinition_dict['examples'])
                _save_translations(word_obj, subdefinition_obj,
                                   subdefinition_dict['translations'])

            # examples belonging to this definition
            _save_examples(definition_obj, definition_dict['examples'])

            # remarks belonging to this definition
            for remark in definition_dict['remarks']:
                Remark(definition=definition_obj, remark=remark).save()

            # words in other languages which have the same meaning
            _save_translations(word_obj, definition_obj,
                               definition_dict['translations'])

        # add morphemes to initial data
        if entry['primary']:
            # Primary means we will link to this word when we find
            # the morpheme. For example, we link 'dorm' to 'dormi'
            # although 'dormo' is also in the dictionary.
            #
            # Add roots (e.g. 'dorm'), forbidding those of one
            # letter since none actually exist in word building.
            root = entry['root']
            if len(root) > 1:
                _add_morphemes(word_obj, root, seen_morphemes)

        # also add the whole word as a morpheme if it is a declinable
        # noun, adjective or adverb
        if (is_declinable_noun(word) or is_declinable_adjective(word) or
            is_declinable_adverb(word)):
            _add_morphemes(word_obj, word, seen_morphemes)

    # add -ant, etc morphemes which aren't in ReVo; these are saved
    # without a primary word
    # NOTE(review): only 'ant' and 'at' are checked against the
    # morphemes seen so far, and asserts are skipped under python -O
    assert 'ant' not in seen_morphemes
    for morpheme in ['int', 'ant', 'ont', 'unt']:
        Morpheme(morpheme=morpheme).save()

    assert 'at' not in seen_morphemes
    for morpheme in ['it', 'at', 'ot']:
        Morpheme(morpheme=morpheme).save()