Beispiel #1
0
    def setUp(self):
        """Fill the test database session with a small, fixed set of objects.

        After calling ``TestWithDb.setUp`` this creates a dataset, a source,
        four contributors linked to one contribution, one fully described
        language plus 100 numbered languages with ISO identifiers, parameters
        with and without domain elements, value sets and values, a unit with
        two unit values, a sentence and a config entry, and finally flushes
        the session.

        Several association objects below are only constructed and never
        passed to ``DBSession.add``; they are linked to objects that are.
        NOTE(review): presumably they are persisted through relationship
        cascades - confirm against the model definitions.
        """
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld'))

        source = common.Source(id='source')
        contributor_names = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        contributors = {
            id_: common.Contributor(id=id_, name=name)
            for id_, name in contributor_names.items()
        }

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        common.ContributionReference(contribution=contribution, source=source)
        # These objects must be created unconditionally. They used to be
        # wrapped in ``assert`` statements, which are removed when Python runs
        # with -O, so the associations would silently be missing.
        for primary, id_ in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                             (False, 'd')]:
            common.ContributionContributor(contribution=contribution,
                                           primary=primary,
                                           contributor=contributors[id_])

        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        identifier = common.Identifier(type='iso639-3', id='iso')
        common.LanguageIdentifier(language=language, identifier=identifier)

        # 100 additional languages 'l2' .. 'l101', each with a zero-padded
        # three-digit identifier.
        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        # A parameter with two domain elements; only the first one is used by
        # a value.
        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de',
                                  name='DomainElement',
                                  parameter=param)
        common.DomainElement(id='de2', name='DomainElement2', parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        # A parameter without domain elements, with a referenced value set.
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        # A unit parameter with a domain element and a unit value using it.
        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(id='sentence',
                                   name='sentence name',
                                   description='sentence description',
                                   analyzed='a\tmorpheme\tdoes\tdo',
                                   gloss='a\tmorpheme\t1SG\tdo.SG2',
                                   source='own',
                                   comment='comment',
                                   original_script='a morpheme',
                                   language=language)
        common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))
        DBSession.flush()
Beispiel #2
0
    def setUp(self):
        """Fill the test database session with a small, fixed set of objects.

        After calling ``TestWithDb.setUp`` this creates a dataset, an inactive
        source marked as replaced plus an active one, four contributors linked
        to one contribution, one language carrying an identifier of every
        ``common.IdentifierType`` plus 100 numbered languages, parameters with
        and without domain elements, value sets and values, a unit with two
        unit values, a sentence, a config entry and two language replacement
        entries, and finally flushes the session.

        Several association objects below are only constructed and never
        passed to ``DBSession.add``; they are linked to objects that are.
        NOTE(review): presumably they are persisted through relationship
        cascades - confirm against the model definitions.
        """
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld',
                           jsondata={'license_icon': 'cc-by'}))

        DBSession.add(
            common.Source(id='replaced',
                          active=False,
                          jsondata={'__replacement_id__': 'source'}))
        source = common.Source(id='source')
        contributor_names = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        contributors = {
            id_: common.Contributor(id=id_,
                                    name=name,
                                    url='http://example.org')
            for id_, name in contributor_names.items()
        }

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        common.ContributionReference(contribution=contribution, source=source)
        # These objects must be created unconditionally. They used to be
        # wrapped in ``assert`` statements, which are removed when Python runs
        # with -O, so the associations would silently be missing.
        for primary, id_ in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                             (False, 'd')]:
            common.ContributionContributor(contribution=contribution,
                                           primary=primary,
                                           contributor=contributors[id_])

        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        # One identifier per known identifier type; the enumeration index
        # makes the identifier ids distinct.
        for i, type_ in enumerate(common.IdentifierType):
            identifier = common.Identifier(type=type_.value,
                                           id=type_.value + str(i),
                                           name='abc')
            common.LanguageIdentifier(language=language,
                                      identifier=identifier)

        # 100 additional languages 'l2' .. 'l101', each with a zero-padded
        # three-digit identifier.
        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        # A parameter with two domain elements, each used by one value of the
        # same value set.
        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de',
                                  name='DomainElement',
                                  parameter=param)
        de2 = common.DomainElement(id='de2',
                                   name='DomainElement2',
                                   parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)
        value2 = common.Value(id='value2',
                              domainelement=de2,
                              valueset=valueset,
                              frequency=50,
                              confidence='high')
        DBSession.add(value2)

        # A parameter without domain elements, with a referenced value set.
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        # A unit parameter with a domain element and a unit value using it.
        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(
            id='sentence',
            name='sentence name',
            description='sentence description',
            analyzed='a\tmorpheme\tdoes\tdo',
            gloss='a\tmorpheme\t1SG\tdo.SG2',
            source='own',
            comment='comment',
            original_script='a morpheme',
            language=language,
            jsondata={'alt_translation': 'Spanish: ...'})
        common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))

        # Register one replacement pointing at the language created above and
        # one with no replacement target.
        common.Config.add_replacement('replaced',
                                      'language',
                                      model=common.Language)
        common.Config.add_replacement('gone', None, model=common.Language)
        DBSession.flush()
Beispiel #3
0
def populate_test_db(engine):
    """Stamp the alembic version on *engine* and load the standard fixtures.

    The fixtures are registered through a ``TestData`` container and the
    global ``DBSession``: a dataset with an editor, contributors, sources,
    one contribution, one richly described language plus 100 numbered ones,
    parameters, value sets, values, units, unit values, a sentence, a config
    entry and two language replacement entries. The session is flushed at
    the end.

    :param engine: passed on unchanged to ``set_alembic_version``.
    """
    set_alembic_version(engine, '58559d4eea0d')

    fixtures = TestData()
    dataset_json = {
        'license_icon': 'cc-by',
        'license_url': 'http://example.org'
    }
    fixtures.add_default(common.Dataset, domain='clld', jsondata=dataset_json)

    # The default contributor plus three extra ones keyed by their id.
    fixtures.add_default(common.Contributor,
                         name='A Name',
                         email='*****@*****.**')
    extra_contributors = {
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name',
    }
    for key, full_name in extra_contributors.items():
        fixtures.add(common.Contributor,
                     key,
                     id=key,
                     name=full_name,
                     url='http://example.org')

    editor = common.Editor(dataset=fixtures[common.Dataset],
                           contributor=fixtures[common.Contributor])
    DBSession.add(editor)

    fixtures.add_default(common.Source)
    fixtures.add(common.Source,
                 'replaced',
                 id='replaced',
                 active=False,
                 jsondata={'__replacement_id__': 'source'})

    fixtures.add_default(common.Contribution)
    contribution = fixtures[common.Contribution]
    source = fixtures[common.Source]
    common.ContributionReference(contribution=contribution, source=source)

    contributor_roles = (
        (True, 'contributor'),
        (False, 'b'),
        (True, 'c'),
        (False, 'd'),
    )
    for is_primary, key in contributor_roles:
        common.ContributionContributor(
            contribution=contribution,
            primary=is_primary,
            contributor=fixtures['Contributor'][key])

    fixtures.add_default(common.Language, latitude=10.5, longitude=0.3)
    language = fixtures[common.Language]
    language.sources.append(source)

    # One identifier per identifier type; the index keeps the ids distinct.
    for index, id_type in enumerate(common.IdentifierType):
        if id_type.name == 'iso':
            label = 'abc'
        else:
            label = 'glot1234'
        typed_identifier = common.Identifier(type=id_type.value,
                                             id=id_type.value + str(index),
                                             name=label)
        common.LanguageIdentifier(language=language,
                                  identifier=typed_identifier)

    name_identifier = common.Identifier(type='name', id='name', name='a')
    common.LanguageIdentifier(language=language, identifier=name_identifier)

    # Languages 'l2' .. 'l101', each with a zero-padded identifier id.
    for number in range(2, 102):
        extra_language = common.Language(id='l%s' % number,
                                         name='Language %s' % number)
        iso_identifier = common.Identifier(type='iso639-3',
                                           id='%.3i' % number,
                                           name='abc')
        common.LanguageIdentifier(language=extra_language,
                                  identifier=iso_identifier)
        DBSession.add(extra_language)

    parameter = fixtures.add_default(common.Parameter)
    first_element = common.DomainElement(id='de',
                                         name='DomainElement',
                                         parameter=parameter)
    second_element = common.DomainElement(id='de2',
                                          name='DomainElement2',
                                          parameter=parameter)

    valueset = fixtures.add_default(common.ValueSet,
                                    language=language,
                                    parameter=parameter,
                                    contribution=contribution)
    common.ValueSetReference(valueset=valueset,
                             source=source,
                             description='10-20')

    # Both values of the default value set share everything but the domain
    # element (and the id of the second one).
    shared_value_kw = dict(valueset=valueset, frequency=50, confidence='high')
    fixtures.add_default(common.Value,
                         domainelement=first_element,
                         **shared_value_kw)
    fixtures.add(common.Value,
                 'value2',
                 id='value2',
                 domainelement=second_element,
                 **shared_value_kw)

    domainless_parameter = fixtures.add(common.Parameter,
                                        'no-domain',
                                        id='no-domain',
                                        name='Parameter without domain')
    domainless_valueset = common.ValueSet(id='vs2',
                                          language=language,
                                          parameter=domainless_parameter,
                                          contribution=contribution)
    common.ValueSetReference(valueset=domainless_valueset,
                             source=source,
                             description='10-20')
    common.Value(id='v2',
                 valueset=domainless_valueset,
                 frequency=50,
                 confidence='high')

    unit = fixtures.add_default(common.Unit, language=language)
    unit_parameter = fixtures.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue',
                     name='UnitValue',
                     unit=unit,
                     unitparameter=unit_parameter)

    domain_unit_parameter = common.UnitParameter(
        id='up2', name='UnitParameter with domain')
    unit_element = common.UnitDomainElement(id='de',
                                            name='de',
                                            parameter=domain_unit_parameter)
    domain_unit_value = common.UnitValue(id='uv2',
                                         name='UnitValue2',
                                         unit=unit,
                                         unitparameter=domain_unit_parameter,
                                         unitdomainelement=unit_element)
    DBSession.add(domain_unit_value)

    DBSession.add(common.Source(id='s'))

    sentence = fixtures.add_default(
        common.Sentence,
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=language,
        jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))

    common.Config.add_replacement('replaced',
                                  'language',
                                  model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
Beispiel #4
0
def main(args):
    """Migrate the tables of the old WOLD database into the new data model.

    Opens the old database at ``DB``, walks its tables one after another and
    creates the corresponding objects via a ``Data`` container and
    ``DBSession``. The session is flushed between stages so that primary
    keys read in later stages are available. At the end, the number of
    ``word_source_word`` rows for which no source word was found is printed.

    :param args: not used by this function.
    :raises ValueError: if a source word is associated with the same donor \
    language more than once.
    """
    old_db = create_engine(DB)
    data = Data()

    #
    # migrate contributor table: complete
    #
    for row in old_db.execute("select * from contributor"):
        data.add(common.Contributor,
                 row['id'],
                 id=row['id'],
                 name='%(firstname)s %(lastname)s' % row,
                 url=row['homepage'],
                 description=row['note'],
                 email=row['email'],
                 address=row['address'])
    # This contributor is added by hand; it is needed as editor below.
    data.add(common.Contributor,
             'haspelmathmartin',
             id='haspelmathmartin',
             name="Martin Haspelmath",
             url="http://email.eva.mpg.de/~haspelmt/")
    DBSession.flush()

    dataset = common.Dataset(
        id='wold',
        name='WOLD',
        description='World Loanword Database',
        domain='wold.clld.org',
        published=date(2009, 8, 15),
        license='http://creativecommons.org/licenses/by/3.0/de/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by/3.0/de/88x31.png',
            'license_name': 'Creative Commons Attribution 3.0 Germany License'
        })
    DBSession.add(dataset)

    # Editors are linked to the dataset in the listed order (ord is 1-based).
    for i, editor in enumerate(['haspelmathmartin', 'tadmoruri']):
        common.Editor(dataset=dataset,
                      contributor=data['Contributor'][editor],
                      ord=i + 1)

    #
    # migrate semantic_field table: complete
    #
    for row in old_db.execute("select * from semantic_field"):
        # The semantic field with id 25 is skipped; the reason is not visible
        # in this function.
        if row['id'] != 25:
            kw = {key: row[key] for key in ['id', 'name', 'description']}
            data.add(models.SemanticField, row['id'], **kw)

    #
    # migrate language table: complete
    # recipient flag is replaced by vocabulary_pk!
    #
    for row in old_db.execute("select * from language order by id"):
        kw = {
            key: row[key]
            for key in [
                'fm_dl_id', 'name', 'latitude', 'longitude',
                'wals_equivalent', 'affiliation', 'family', 'genus',
                'countries'
            ]
        }
        data.add(models.WoldLanguage, row['id'], id=str(row['id']), **kw)

    #
    # migrate language_code table: complete
    #
    for row in old_db.execute("select * from language_code"):
        _id = '%(type)s-%(code)s' % row
        data.add(common.Identifier,
                 _id,
                 id=_id,
                 type=row['type'],
                 name=row['code'])
        # ISO 639-3 codes found in ``glottocodes`` additionally get a
        # glottolog identifier.
        if row['type'] == 'iso639-3' and row['code'] in glottocodes:
            gc = glottocodes[row['code']]
            data.add(common.Identifier,
                     gc,
                     id=gc,
                     type=common.IdentifierType.glottolog.value,
                     name=gc)
    DBSession.flush()

    #
    # migrate language_code_language table: complete
    #
    for row in old_db.execute("select * from language_code_language"):
        _id = '%(type)s-%(code)s' % row
        data.add(common.LanguageIdentifier,
                 '%s-%s' % (_id, row['language_id']),
                 identifier_pk=data['Identifier'][_id].pk,
                 language_pk=data['WoldLanguage'][row['language_id']].pk)
        if row['type'] == 'iso639-3' and row['code'] in glottocodes:
            gc = glottocodes[row['code']]
            data.add(common.LanguageIdentifier,
                     '%s-%s' % (gc, row['language_id']),
                     identifier_pk=data['Identifier'][gc].pk,
                     language_pk=data['WoldLanguage'][row['language_id']].pk)
    DBSession.flush()

    #
    # migrate vocabulary table: complete
    #
    for row in old_db.execute("select * from vocabulary order by id"):
        # All 'fd_*' columns and two free-text columns are kept as JSON data.
        jsondata = {}
        for key in row.keys():
            if key.startswith('fd_') or key in [
                    'other_information', 'abbreviations'
            ]:
                jsondata[key] = row[key]
        vocab = data.add(models.Vocabulary,
                         row['id'],
                         id=str(row['id']),
                         name=row['name'],
                         color=row['color'],
                         jsondata=jsondata)
        # Flush so that vocab.pk is available for the language link below.
        DBSession.flush()
        data['WoldLanguage'][row['language_id']].vocabulary_pk = vocab.pk
    DBSession.flush()

    #
    # migrate contact_situation and age tables: complete
    # contact situations and ages are unitdomainelements!
    #
    contact_situation = common.UnitParameter(id='cs', name='Contact Situation')
    age = common.UnitParameter(id='a', name='Age')

    DBSession.add(contact_situation)
    DBSession.add(age)
    DBSession.flush()

    for row in old_db.execute("select * from contact_situation"):
        if row['vocabulary_id'] is None:
            continue
        kw = {key: row[key] for key in ['description', 'id', 'name']}
        kw['id'] = 'cs-%s' % kw['id']
        p = data.add(models.WoldUnitDomainElement, row['id'], **kw)
        p.vocabulary = data['Vocabulary'][row['vocabulary_id']]
        p.unitparameter_pk = contact_situation.pk

    for row in old_db.execute("select * from age"):
        id_ = '%(vocabulary_id)s-%(label)s' % row
        kw = {key: row[key] for key in ['start_year', 'end_year']}
        p = data.add(models.WoldUnitDomainElement,
                     id_,
                     id='a-%s' % id_,
                     name=row['label'],
                     description=row['description'],
                     jsondata=kw)
        p.vocabulary = data['Vocabulary'][row['vocabulary_id']]
        p.unitparameter_pk = age.pk

    #
    # migrate meaning table: complete
    #
    for row in old_db.execute("select * from meaning"):
        kw = {
            key: row[key]
            for key in [
                'description', 'core_list', 'ids_code', 'typical_context',
                'semantic_category'
            ]
        }
        # Dots in the old meaning id are replaced by dashes for the new id;
        # the part after the first dot (if any) becomes the sub code.
        p = data.add(
            models.Meaning,
            row['id'],
            id=row['id'].replace('.', '-'),
            name=row['label'],
            sub_code=row['id'].split('.')[1] if '.' in row['id'] else '',
            semantic_field=data['SemanticField'][row['semantic_field_id']],
            **kw)
        DBSession.flush()

        for field in ['french', 'spanish', 'german', 'russian']:
            DBSession.add(
                models.Translation(name=row[field], lang=field, meaning=p))

        # One value set per (language, meaning) pair.
        for key in data['WoldLanguage']:
            lang = data['WoldLanguage'][key]
            data.add(common.ValueSet,
                     '%s-%s' % (key, row['id']),
                     id='%s-%s' % (key, row['id'].replace('.', '-')),
                     language=lang,
                     contribution=lang.vocabulary,
                     parameter=p)

    DBSession.flush()

    #
    # migrate word table:
    # TODO: all the other word properties!!
    #
    # Columns of the word table that are stored as JSON data on each word.
    fields = [
        'age_label',
        'original_script',
        'grammatical_info',
        'comment_on_word_form',
        'gloss',
        "comment_on_borrowed",
        "calqued",
        "borrowed_base",
        "numeric_frequency",
        "relative_frequency",
        "effect",
        "integration",
        "salience",
        "reference",
        "other_comments",
        "register",
        "loan_history",
        'colonial_word',
        'paraphrase_in_dutch',
        'word_source',
        'paraphrase_in_german',
        'lexical_stratum',
        'comparison_with_mandarin',
        'year',
        'comparison_with_korean',
        'czech_translation',
        'hungarian_translation',
        'early_romani_reconstruction',
        'etymological_note',
        'boretzky_and_igla_etymology',
        'manuss_et_al_etymology',
        'vekerdi_etymology',
        'turner_etymology',
        'other_etymologies',
        'mayrhofer_etymology',
    ]
    # Maps word id -> vocabulary id; needed to find the value set of a
    # counterpart when migrating word_meaning below.
    word_to_vocab = {}
    for row in old_db.execute("select * from word"):
        word_to_vocab[row['id']] = row['vocabulary_id']
        kw = {
            key: row[key]
            for key in [
                'id', 'age_score', 'borrowed', 'borrowed_score',
                'analyzability', 'simplicity_score'
            ]
        }
        w = data.add(models.Word,
                     row['id'],
                     name=row['form'],
                     description=row['free_meaning'],
                     jsondata={k: row[k]
                               for k in fields},
                     **kw)
        w.language = data['Vocabulary'][row['vocabulary_id']].language

        if row['age_label']:
            DBSession.add(
                common.UnitValue(
                    id='%(id)s-a' % row,
                    unit=w,
                    unitparameter=age,
                    unitdomainelement=data['WoldUnitDomainElement'][
                        '%(vocabulary_id)s-%(age_label)s' % row],
                    contribution=data['Vocabulary'][row['vocabulary_id']]))

        # One specific contact situation id is excluded here; the reason is
        # not visible in this function.
        if row['contact_situation_id'] and row[
                'contact_situation_id'] != '9129144185487768':
            DBSession.add(
                common.UnitValue(
                    id='%(id)s-cs' % row,
                    unit=w,
                    unitparameter=contact_situation,
                    unitdomainelement=data['WoldUnitDomainElement'][
                        row['contact_situation_id']],
                    contribution=data['Vocabulary'][row['vocabulary_id']]))

    DBSession.flush()

    #
    # migrate word_meaning table: complete
    #
    for i, row in enumerate(old_db.execute("select * from word_meaning")):
        data.add(
            models.Counterpart,
            i,
            id=i,
            description='%(relationship)s (%(comment_on_relationship)s)' % row,
            name=data['Word'][row['word_id']].name,
            valueset=data['ValueSet']['%s-%s' % (word_to_vocab[row['word_id']],
                                                 row['meaning_id'])],
            word=data['Word'][row['word_id']])
    DBSession.flush()

    #
    # migrate vocabulary_contributor table: complete
    #
    for row in old_db.execute("select * from vocabulary_contributor"):
        DBSession.add(
            common.ContributionContributor(
                ord=row['ordinal'],
                primary=row['primary'],
                contributor_pk=data['Contributor'][row['contributor_id']].pk,
                contribution_pk=data['Vocabulary'][row['vocabulary_id']].pk))

    DBSession.flush()

    #
    # source words: we have to make sure a word does only belong to one language.
    # thus, we have to reassign identifier!
    #
    # loop over source_word, source_word_donor_language pairs keeping track of source_word ids:
    # known_ids maps source_word id -> list of donor language ids seen so far;
    # the n-th donor language of a source word yields the word id '<id>-<n>'.
    known_ids = {}

    for row in old_db.execute(
            "select sw.id, sw.meaning, sw.form, dl.language_id from source_word as sw, source_word_donor_language as dl where sw.id = dl.source_word_id"
    ):
        if row['id'] in known_ids:
            # source_word was already seen associated to a different donor language!
            # An explicit check instead of ``assert`` so that it is still
            # performed when Python runs with -O.
            if row['language_id'] in known_ids[row['id']]:
                raise ValueError(
                    'source word %s is associated with donor language %s '
                    'more than once' % (row['id'], row['language_id']))
            known_ids[row['id']].append(row['language_id'])
            id_ = '%s-%s' % (row['id'], len(known_ids[row['id']]))
        else:
            id_ = '%s-%s' % (row['id'], 1)
            known_ids[row['id']] = [row['language_id']]

        new = data.add(models.Word,
                       id_,
                       id=id_,
                       name=row['form'],
                       description=row['meaning'])
        new.language = data['WoldLanguage'][row['language_id']]

    # source words may end up as words without language!
    for row in old_db.execute(
            "select id, meaning, form from source_word where id not in (select source_word_id from source_word_donor_language)"
    ):
        id_ = '%s-%s' % (row['id'], 1)
        data.add(models.Word,
                 id_,
                 id=id_,
                 name=row['form'],
                 description=row['meaning'])

    DBSession.flush()

    #
    # migrate word_source_word relations
    # TODO: should be modelled as UnitParameter!
    #
    rows_without_source_word = 0
    for row in old_db.execute("select * from word_source_word"):
        # there may be more than one word associated with a source_word_id (see above)
        # The words were registered under the consecutive ids '<id>-1',
        # '<id>-2', ...; collect them until the first missing id instead of
        # guessing an upper bound, so no donor language is silently dropped.
        source_words = []
        n = 1
        while True:
            id_ = '%s-%s' % (row['source_word_id'], n)
            if id_ not in data['Word']:
                break
            source_words.append(data['Word'][id_])
            n += 1
        if not source_words:
            rows_without_source_word += 1

        # A loan is only marked as certain if there is exactly one candidate
        # source word.
        for sw in source_words:
            DBSession.add(
                models.Loan(source_word=sw,
                            target_word=data['Word'][row['word_id']],
                            relation=row['relationship'],
                            certain=len(source_words) == 1))

    print('%s source words not migrated because they have no donor language!' %
          rows_without_source_word)