コード例 #1
0
ファイル: populatedb.py プロジェクト: necrop/wordrobot
def populate_language():
    """
    Populate the Language table.

    Works in two passes. First, every language yielded by the taxonomy is
    bulk-inserted with ``family=None``; names are cut to the ``max_length``
    of the ``Language.name`` field. Second, once all rows exist, each
    language whose taxonomy family is not None has its ``family`` set to the
    Language row of that family and is saved.

    Raises:
        Language.DoesNotExist: if a language, or the family the taxonomy
            reports for it, has no row in the Language table.
    """
    taxonomy = LanguageTaxonomy()
    taxonomy.families = set(buildconfig.LANGUAGE_FAMILIES)

    max_length = Language._meta.get_field("name").max_length

    # Materialise once so that both passes work from the same nodes and the
    # taxonomy is only traversed a single time.
    languages = list(taxonomy.languages())

    Language.objects.bulk_create(
        [
            Language(id=language.id, name=language.name[:max_length], family=None)
            for language in languages
        ]
    )

    # Load every row in one query instead of issuing two lookups per
    # language inside the loop below.
    stored = {record.id: record for record in Language.objects.all()}

    for language in languages:
        family = taxonomy.family_of(language.name)
        if family is None:
            continue
        try:
            record = stored[language.id]
            record.family = stored[family.id]
        except KeyError as missing:
            # Keep the exception type a per-row .get() would have raised.
            raise Language.DoesNotExist(
                "No Language row with id %r" % (missing.args[0],)
            )
        record.save()
コード例 #2
0
ファイル: populatedb.py プロジェクト: necrop/wordrobot
def populate_lexical():
    """
    Populate the Lemma, Wordform, and Definition tables.

    For each letter a-z, reads ``<FORM_INDEX_DIR>/refined/<letter>.json``,
    which holds one JSON document per line; each document is unpacked
    positionally into a ``BlockData``. Every block produces one Lemma and
    one Wordform per entry in its standard, variant and alien type lists.
    A Definition is created only when the block has a definition and its
    f2000 value is below ``buildconfig.FREQUENCY_CUTOFF``.

    Lemma and Definition ids are assigned from counters that start at 1 and
    keep running across letters, so that rows can reference each other by
    id before they are written. Rows are written with ``bulk_create`` in
    batches.
    """
    in_dir = os.path.join(buildconfig.FORM_INDEX_DIR, "refined")
    frequency_cutoff = buildconfig.FREQUENCY_CUTOFF
    batch_size = 1000  # number of lemmas collected before each flush

    taxonomy = LanguageTaxonomy()
    lemma_counter = 0
    definition_counter = 0

    def flush(definitions, lemmas, wordforms):
        # Order matters: lemmas point at definitions and wordforms point at
        # lemmas by id, so the referenced rows are written first.
        Definition.objects.bulk_create(definitions)
        Lemma.objects.bulk_create(lemmas)
        Wordform.objects.bulk_create(wordforms)

    for letter in string.ascii_lowercase:
        stdout.write("Inserting data for %s...\n" % letter)
        in_file = os.path.join(in_dir, letter + ".json")

        # The whole file is parsed before anything is inserted, so a
        # malformed line aborts this letter before it touches the database.
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(in_file, "r", encoding="utf-8") as filehandle:
            stripped_lines = (line.strip() for line in filehandle)
            # Blank lines (e.g. a trailing newline) carry no record.
            blocks = [BlockData(*json.loads(text)) for text in stripped_lines if text]

        lemmas = []
        wordforms = []
        definitions = []
        for block in blocks:
            lang_node = taxonomy.node(language=block.language)
            language_id = None if lang_node is None else lang_node.id

            if block.definition and block.f2000 < frequency_cutoff:
                definition_counter += 1
                definition_id = definition_counter
                # NOTE(review): 100 is hard-coded; presumably it mirrors the
                # length limit of Definition.text - confirm against the model.
                definitions.append(
                    Definition(id=definition_id, text=block.definition[:100])
                )
            else:
                definition_id = None

            lemma_counter += 1
            lemmas.append(
                Lemma(
                    id=lemma_counter,
                    lemma=block.lemma,
                    sort=block.sort,
                    wordclass=block.wordclass,
                    firstyear=block.start,
                    lastyear=block.end,
                    refentry=block.refentry,
                    refid=block.refid,
                    thesaurus_id=block.htlink,
                    language_id=language_id,
                    definition_id=definition_id,
                    f2000=_rounder(block.f2000),
                    f1950=_rounder(block.f1950),
                    f1900=_rounder(block.f1900),
                    f1850=_rounder(block.f1850),
                    f1800=_rounder(block.f1800),
                    f1750=_rounder(block.f1750),
                )
            )

            for typelist in (block.standard_types, block.variant_types, block.alien_types):
                for typeunit in typelist:
                    # typeunit is indexed positionally; index 3 is not used.
                    wordforms.append(
                        Wordform(
                            sort=typeunit[0],
                            wordform=typeunit[1],
                            wordclass=typeunit[2],
                            lemma_id=lemma_counter,
                            f2000=_rounder(typeunit[4]),
                            f1900=_rounder(typeunit[5]),
                            f1800=_rounder(typeunit[6]),
                        )
                    )

            # Flush on full batches only.
            if len(lemmas) >= batch_size:
                flush(definitions, lemmas, wordforms)
                definitions = []
                lemmas = []
                wordforms = []

        # Write whatever is left over for this letter.
        flush(definitions, lemmas, wordforms)