Esempio n. 1
0
def add_pronouns(ctx):
    """Load pronoun stems and their inflected forms from the PRONOUNS CSV.

    Every distinct (stem, gender group) pair in the file becomes one
    `PronounStem` row; every CSV row becomes one `Nominal` row pointing
    at that stem. The session is committed and closed at the end.

    NOTE(review): forms are stored as `Nominal` objects although the
    original docstring mentioned `Pronoun` -- confirm which is intended.

    :param ctx: context object exposing `session` and `config`.
    """

    session = ctx.session
    gender_groups = ENUM['gender_group']
    genders = ENUM['gender']
    cases = ENUM['case']
    numbers = ENUM['number']

    # Maps (stem name, gender group id) to the id of the stored stem.
    stem_ids = {}
    for record in util.read_csv(ctx.config['PRONOUNS']):
        stem_name = record['stem']
        key = (stem_name, gender_groups[record['stem_genders']])

        if key not in stem_ids:
            new_stem = PronounStem(name=stem_name, genders_id=key[1])
            session.add(new_stem)
            # Flush so the stem's id is available before it is cached.
            session.flush()
            stem_ids[key] = new_stem.id
            util.tick(stem_name)

        form_fields = {
            'stem_id': stem_ids[key],
            'name': record['form'],
            'gender_id': genders[record['form_gender']],
            'case_id': cases[record['case']],
            'number_id': numbers[record['number']],
        }
        session.add(Nominal(**form_fields))
        session.flush()

    session.commit()
    session.close()
Esempio n. 2
0
def add_nominal_endings(ctx):
    """Load compounded and inflected endings into `NominalEnding`.

    Rows from the COMPOUNDED_NOMINAL_ENDINGS CSV are stored with
    `compounded=True` and no case or number. Rows from the
    INFLECTED_NOMINAL_ENDINGS CSV are stored with `compounded=False`
    and their case and number ids. The session is committed and closed
    at the end.

    :param ctx: context object exposing `session` and `config`.
    """
    session = ctx.session
    genders = ENUM['gender']
    cases = ENUM['case']
    numbers = ENUM['number']

    # Endings used inside compounds carry no case or number.
    for record in util.read_csv(ctx.config['COMPOUNDED_NOMINAL_ENDINGS']):
        session.add(
            NominalEnding(
                name=record['ending'],
                stem_type=record['stem_type'],
                gender_id=genders[record['form_gender']],
                case_id=None,
                number_id=None,
                compounded=True,
            )
        )

    # Stem types that have already been reported through `util.tick`.
    reported = set()
    for record in util.read_csv(ctx.config['INFLECTED_NOMINAL_ENDINGS']):
        session.add(
            NominalEnding(
                name=record['ending'],
                stem_type=record['stem_type'],
                gender_id=genders[record['form_gender']],
                case_id=cases[record['case']],
                number_id=numbers[record['number']],
                compounded=False,
            )
        )

        stem_type = record['stem_type']
        if stem_type in reported:
            continue
        util.tick(stem_type)
        reported.add(stem_type)

    session.commit()
    session.close()
Esempio n. 3
0
def add_verbs(ctx, root_map):
    """Load inflected verb forms from the VERBS CSV into `Verb`.

    Rows whose (root, hom) pair is missing from `root_map` are skipped,
    and the number of distinct skipped pairs is printed at the end.
    Work is committed after every 1000 stored forms and once more when
    the file is exhausted; the session is then closed.

    :param ctx: context object exposing `session` and `config`.
    :param root_map: mapping from (root, hom) to a root id.
    """

    session = ctx.session
    vclasses = ENUM['vclass']
    persons = ENUM['person']
    numbers = ENUM['number']
    modes = ENUM['mode']
    voices = ENUM['voice']

    missing_roots = set()
    stored = 0

    for record in util.read_csv(ctx.config['VERBS']):
        key = (record['root'], record['hom'])

        try:
            root_id = root_map[key]
        except KeyError:
            missing_roots.add(key)
            continue

        form = record['form']
        # The verb class is optional; an empty value is stored as None.
        class_abbr = record['class']
        vclass_id = vclasses[class_abbr] if class_abbr else None

        session.add(
            Verb(
                name=form,
                root_id=root_id,
                vclass_id=vclass_id,
                person_id=persons[record['person']],
                number_id=numbers[record['number']],
                mode_id=modes[record['mode']],
                voice_id=voices[record['voice']],
            )
        )

        stored += 1
        # Commit in batches of 1000 stored forms.
        if not stored % 1000:
            util.tick(record['form'])
            session.commit()

    session.commit()
    session.close()
    print('Skipped', len(missing_roots), 'roots.')
Esempio n. 4
0
def add_irregular_adjectives(ctx):
    """Add irregular adjectives to the database.

    Reads the multi-document YAML stream at
    ``ctx.config['IRREGULAR_ADJECTIVES']``. Each document is read for
    the keys ``name``, ``complete`` and ``forms``; each entry of
    ``forms`` is read for ``name``, ``gender``, ``case`` and ``number``.

    For every document this creates an `AdjectiveStem`, a
    `StemIrregularity` marking the stem as irregular (with
    ``fully_described`` taken from ``complete``), and one `Adjective`
    per listed form. The session is committed and closed at the end.

    :param ctx: context object exposing `session` and `config`.
    :raises KeyError: if a document or form lacks one of the keys above,
                      or uses a gender/case/number abbreviation that is
                      not present in `ENUM`.
    """

    session = ctx.session
    gender = ENUM['gender']
    case = ENUM['case']
    number = ENUM['number']

    with open(ctx.config['IRREGULAR_ADJECTIVES']) as f:
        # Use the safe loader: `yaml.load_all` without an explicit Loader
        # is rejected by PyYAML >= 6 and, on older releases, may construct
        # arbitrary Python objects. The data consumed below is accessed
        # only as plain mappings and sequences.
        for adj in yaml.safe_load_all(f):
            stem = AdjectiveStem(name=adj['name'])
            session.add(stem)
            session.flush()

            # Mark the stem as irregular
            complete = adj['complete']
            irreg = StemIrregularity(stem=stem, fully_described=complete)
            session.add(irreg)
            session.flush()

            util.tick(stem.name)

            for form in adj['forms']:
                name = form['name']
                gender_id = gender[form['gender']]
                case_id = case[form['case']]
                number_id = number[form['number']]

                result = Adjective(stem=stem,
                                   name=name,
                                   gender_id=gender_id,
                                   case_id=case_id,
                                   number_id=number_id)
                session.add(result)

    session.commit()
    session.close()
Esempio n. 5
0
def add_participle_stems(ctx):
    """Load participle stems from the PARTICIPLE_STEMS CSV.

    One `ParticipleStem` is stored per row. Only the part of the `stem`
    column before the first '#' is used as the stored name. Work is
    committed after every 100 rows and once more at the end, after
    which the session is closed.

    :param ctx: context object exposing `session` and `config`.
    """

    session = ctx.session
    modes = ENUM['mode']
    voices = ENUM['voice']

    rows = util.read_csv(ctx.config['PARTICIPLE_STEMS'])
    for count, record in enumerate(rows, 1):
        # Drop anything from the first '#' onwards in the stem column.
        stem_name = record['stem'].partition('#')[0]
        session.add(
            ParticipleStem(
                name=stem_name,
                mode_id=modes[record['mode']],
                voice_id=voices[record['voice']],
            )
        )

        # Commit in batches of 100 rows.
        if not count % 100:
            util.tick(record['stem'])
            session.commit()

    session.commit()
    session.close()
Esempio n. 6
0
def add_enums(ctx):
    """Store the enumerated lookup data and fill the `ENUM` cache.

    The ENUMS CSV is read twice:

    1. every row except those of type ``gender_group`` is stored using
       the model class registered for its ``enum_type`` (rows with an
       unregistered type are ignored);
    2. the ``gender_group`` rows are stored afterwards, once the
       individual gender ids are available in ``ENUM['gender']``.

    For each stored row, ``ENUM[<model __tablename__>][<abbreviation>]``
    is set to the id of the new record. The session is committed after
    each pass and closed at the end.

    :param ctx: context object exposing `session` and `config`.
    """

    session = ctx.session
    models_by_type = {
        'case': Case,
        'class': VClass,
        'gender': Gender,
        'gender_group': GenderGroup,
        'modification': Modification,
        'mode': Mode,
        'number': Number,
        'person': Person,
        'sandhi_rule_type': SandhiType,
        'voice': Voice,
    }

    # Pass 1: everything except gender groups.
    for record in util.read_csv(ctx.config['ENUMS']):
        kind = record['enum_type']
        if kind == 'gender_group':
            continue

        model = models_by_type.get(kind)
        # TODO: always non-None?
        if model is None:
            continue

        table = model.__tablename__
        if table not in ENUM:
            # First row seen for this table: report it and open its cache.
            util.tick(model.__name__)
            ENUM[table] = {}

        abbr = record['abbreviation']
        entry = model(name=record['human_readable_value'], abbr=abbr)
        session.add(entry)
        # Flush so the id exists before it is cached.
        session.flush()
        ENUM[table][abbr] = entry.id
    session.commit()

    # Pass 2: gender groups only.
    group_model = models_by_type['gender_group']
    for record in util.read_csv(ctx.config['ENUMS']):
        if record['enum_type'] != 'gender_group':
            continue

        table = group_model.__tablename__
        if table not in ENUM:
            util.tick(group_model.__name__)
            ENUM[table] = {}

        abbr = record['abbreviation']
        entry = group_model(name=record['human_readable_value'], abbr=abbr)
        session.add(entry)
        session.flush()

        # Only abbreviations made up solely of 'm', 'f' and 'n' get
        # their members filled in, one per character.
        if set(abbr) <= set('mfn'):
            entry.members = [ENUM['gender'][letter] for letter in abbr]

        ENUM[table][abbr] = entry.id

    session.commit()
    session.close()