Code example #1
    def load_examples(self, dictionary, data, lang):
        abbr_p = re.compile(r'\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')
        for i, ex in enumerate(
                Examples.from_file(self.dir.joinpath('processed', 'examples.sfm'))):
            obj = data.add(
                models.Example,
                ex.id,
                id='%s-%s' % (self.id, ex.id.replace('.', '_')),
                name=ex.text,
                number='{0}'.format(i + 1),
                source=ex.corpus_ref,
                language=lang,
                serialized='{0}'.format(ex),
                dictionary=dictionary,
                analyzed=ex.morphemes,
                gloss=abbr_p.sub(lambda m: m.group('abbr').upper(), ex.gloss) if ex.gloss else ex.gloss,
                description=ex.translation,
                alt_translation1=ex.alt_translation,
                alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
                alt_translation2=ex.alt_translation2,
                alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'))
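            # flush() assigns the new Example's primary key so that add_file() below can reference obj.pk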
            DBSession.flush()

            if ex.soundfile:
                self.add_file('audio', ex.soundfile, common.Sentence_files, obj)
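
Most of the examples in this collection follow the same pattern: objects are added to the clld DBSession, and DBSession.flush() is called so that database-generated values (above all the primary key) become available to dependent rows before the transaction is committed. Below is a minimal sketch of that pattern, assuming the stock clld.db.models.common models and a configured scoped DBSession; the helper name attach_audio is made up for illustration.

from clld.db.meta import DBSession
from clld.db.models import common


def attach_audio(lang_id, lang_name, filename):
    # add a Language, flush to obtain its primary key, then link a file to it
    lang = common.Language(id=lang_id, name=lang_name)
    DBSession.add(lang)
    DBSession.flush()  # assigns lang.pk without committing the transaction

    f = common.Language_files(
        id='%s-audio' % lang.id,
        name=filename,
        object_pk=lang.pk,  # only available after the flush above
        mime_type='audio/mpeg')
    DBSession.add(f)
    DBSession.flush()
    return lang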
Code example #2
 def setUp(self):
     TestWithDb.setUp(self)
     DBSession.add(common.Dataset(id='d', name='test', domain='localhost'))
     family = Family(id='f', name='family', description='desc')
     DBSession.add(LanguageWithFamily(id='l1', family=family))
     DBSession.add(LanguageWithFamily(id='l2'))
     DBSession.flush()
Code example #3
 def setUp(self):
     super(_TestWithDb, self).setUp()
     DBSession.add(common.Dataset(id='d', name='test', domain='localhost'))
     family = Family(id='f', name='family', description='desc', jsondata=dict(icon=1))
     DBSession.add(LanguageWithFamily(id='l1', family=family))
     DBSession.add(LanguageWithFamily(id='l2'))
     DBSession.flush()
Code example #4
File: initializedb.py Project: kublaj/glottolog3
 def create_languoid(row, father_pk=None):
     glottocode = {'akun1242': 'akun1241'}.get(row['alnumcode'], row['alnumcode'])
     attrs = dict(
         pk=row['id'],
         id=glottocode,
         name=row['primaryname'],
         description=row['globalclassificationcomment'],
         level=getattr(models2.LanguoidLevel, row['level']),
         status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None),
         father_pk=father_pk,
         created=row['updated'],
         jsondata={} if not row['hname'] else {'hname': row['hname']},
     )
     for attr in [
         'active',
         'updated',
         'hid',
         'latitude',
         'longitude',
     ]:
         attrs[attr] = row[attr]
     l = data.add(models2.Languoid, row['id'], **attrs)
     for type_ in params:
         id_ = '%s%s' % (type_, row['id'])
         vs = data.add(
             common.ValueSet, id_,
             id=id_,
             description=row['classificationcomment'] if type_ == 'fc' else row['subclassificationcomment'],
             language=l,
             parameter=params[type_],
             contribution=contrib)
         data.add(common.Value, id_, id=id_, name='%s - %s' % (row['level'], row['status']), valueset=vs)
         DBSession.flush()
         valuesets[id_] = vs.pk
     return str(row['id'])
Code example #5
File: views.py Project: uwblueprint/glottolog3
def put_languoid(request):
    glottocode = request.matchdict['glottocode']
    languoid = query_languoid(DBSession, glottocode)
    if languoid is None:
        request.response.status = 404
        return {'error': 'Not a valid languoid ID'}

    json_data = request.json_body
    try:
        data, errors = LanguoidSchema(partial=True).load(json_data)
    except ValueError:
        request.response.status = 400
        return {'error': 'Not a valid languoid level'}
    if errors:
        request.response.status = 400
        return {'error': errors}

    try:
        for key, value in data.items():
            setattr(languoid, key, value)
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}

    return LanguoidSchema().dump(languoid).data
Code example #6
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description=
        "Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon':
            'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )

    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(common.Contributor,
                      "Kevin Scannell",
                      id="Kevin Scannell",
                      name="Kevin Scannell",
                      email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
Code example #7
File: test_db_meta.py Project: mitcho/clld
    def test_CustomModelMixin(self):
        from clld.tests.fixtures import CustomLanguage

        DBSession.add(CustomLanguage(id='abc', name='Name', custom='c'))
        DBSession.flush()
        for lang in DBSession.query(Language).filter(Language.id == 'abc'):
            self.assertEqual(lang.custom, 'c')
            break
Code example #8
File: test_db_models.py Project: mitcho/clld
    def test_Dataset(self):
        from clld import RESOURCES
        from clld.db.models.common import Dataset, Source

        d = Dataset(id='abc', domain='test')
        DBSession.add(d)
        DBSession.flush()
        d.get_stats(RESOURCES, source=Source.id == None)
Code example #9
    def test_CustomModelMixin(self):
        from clld.tests.fixtures import CustomLanguage

        DBSession.add(CustomLanguage(id='abc', name='Name', custom='c'))
        DBSession.flush()
        for lang in DBSession.query(Language).filter(Language.id == 'abc'):
            self.assertEqual(lang.custom, 'c')
            break
Code example #10
    def test_Dataset(self):
        from clld import RESOURCES
        from clld.db.models.common import Dataset, Source

        d = Dataset(id='abc', domain='test')
        DBSession.add(d)
        DBSession.flush()
        d.get_stats(RESOURCES, source=Source.id == None)
Code example #11
File: import_tree.py Project: pombredanne/glottolog3
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

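            # update the languoid in place if it already exists, otherwise create it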
            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
Code example #12
File: test_db_models.py Project: cevmartinez/clld
    def test_Data(self):
        from clld.db.models.common import Language, Language_data

        l = Language(id='abc', name='Name')
        l.data.append(Language_data(key='abstract', value='c'))
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        self.assertEqual(l.datadict()['abstract'], 'c')
Code example #13
File: test_db_meta.py Project: esbesb/clld
def test_JSONEncodedDict(db):
    l = Language(id='abc', name='Name', jsondata={'i': 2})
    DBSession.add(l)
    DBSession.flush()

    DBSession.expunge(l)
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        assert lang.jsondata['i'] == 2
        break
Code example #14
File: test_db_meta.py Project: pepe-localhost/clld
def test_Base_jsondata(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    l.update_jsondata(a=1)
    assert 'a' in l.jsondata
    l.update_jsondata(b=1)
    assert 'b' in l.jsondata and 'a' in l.jsondata
    assert 'b' in l.__json__(None)['jsondata']
Code example #15
File: test_db_meta.py Project: clld/clld
def test_JSONEncodedDict(db):
    l = Language(id='abc', name='Name', jsondata={'i': 2})
    DBSession.add(l)
    DBSession.flush()

    DBSession.expunge(l)
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        assert lang.jsondata['i'] == 2
        break
Code example #16
File: test_db_models.py Project: clld/clld
def test_Data(db):
    from clld.db.models.common import Language, Language_data

    l = Language(id='abc', name='Name')
    l.data.append(Language_data(key='abstract', value='c'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.datadict()['abstract'] == 'c'
Code example #17
File: initializedb.py Project: Castroyesid/wals3
def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()
Code example #18
    def test_compute_language_sources(self):
        from clld.db.models.common import Source, Sentence, Language, SentenceReference
        from clld.db.meta import DBSession
        from clld.db.util import compute_language_sources

        s = Sentence(id='sentenced', language=Language(id='newlang'))
        sr = SentenceReference(sentence=s, source=Source.first())
        DBSession.add(sr)
        DBSession.flush()
        compute_language_sources()
Code example #19
File: test_db_meta.py Project: pepe-localhost/clld
def test_Base(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    DBSession.expunge(l)
    l = Language.get('abc', session=DBSession)
    assert l.name == 'Name'

    Language().__str__()
    assert repr(l) == "<Language 'abc'>"
Code example #20
File: initializedb.py Project: JChungYS/wals3
def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()
Code example #21
File: util.py Project: HedvigS/grambank
def import_features(cldf, contributors):  # pragma: no cover
    """
    ? = gray cbbbbbb (is ? mapped? if not then don't worry)
    0 = blue c0077bb
    1 = red ccc3311
    2 = teal c009988
    3 = orange cee7733
    """
    features, codes = {}, {}
    icons = [
        'cffffff',  # 'c0077bb'
        'cff0000',  # 'ccc3311'
        'c0000ff',  # 'c009988'
        'cffff00',  # 'cee7733'
    ]
    domains = {}
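    # group the codes by feature and append a synthetic '?' (not known) element to each feature's domain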
    for fid, des in itertools.groupby(
            sorted(cldf['CodeTable'], key=lambda c: c['Parameter_ID']),
            lambda c: c['Parameter_ID']):
        domains[fid] = list(des) + [
            dict(ID=fid + '-NA', Name='?', Description='Not known')
        ]

    for feature in tqdm(list(cldf['ParameterTable']), desc='loading features'):
        fid = feature['ID']
        f = Feature(
            id=fid,
            name=feature['Name'],
            description=feature['Description'],
        )
        for ord, patron in enumerate(feature['Patrons'], start=1):
            DBSession.add(
                FeaturePatron(ord=ord,
                              feature=f,
                              contributor_pk=contributors[patron]))
        for code in domains[fid]:
            if code['Name'] == '?':
                icon, number, value = 'tcccccc', 999, None
            else:
                icon, number, value = icons[int(code['Name'])], int(
                    code['Name']), code['Name']
            DomainElement(id=code['ID'],
                          parameter=f,
                          name=code['Name'],
                          number=number,
                          description=code['Description'],
                          jsondata=dict(icon=icon))
        DBSession.add(f)
        DBSession.flush()
        features[fid] = f.pk
        for de in f.domain:
            codes[de.id] = de.pk

    return features, codes
Code example #22
def _addSource(lp):
    """For a lighter 'main' function."""

    DBSession.add(
        common.Source(id=lp[0],
                      name=lp[0],
                      author=lp[2],
                      year=lp[3],
                      title=lp[4],
                      url=lp[5],
                      note=lp[6]))
    DBSession.flush()
Code example #23
File: test_db_models.py Project: mitcho/clld
    def test_Files(self):
        from clld.db.models.common import Language, Language_files

        if PY3:
            return  # pragma: no cover

        l = Language(id='abc', name='Name')
        assert l.iso_code is None
        l._files.append(Language_files(id='abstract'))
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        f = l.files['abstract']
Code example #24
    def test_Files(self):
        from clld.db.models.common import Language, Language_files

        if PY3:
            return  # pragma: no cover

        l = Language(id='abc', name='Name')
        assert l.iso_code is None
        l._files.append(Language_files(id='abstract'))
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        f = l.files['abstract']
Code example #25
File: test_db_models.py Project: clld/clld
def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(Path(tmppath), 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
Code example #26
def main(args):
    data = Data()
    data.add(
        common.Dataset,
        "starling",
        id="starling",
        name="Starling",
        domain="starling.rinet.ru",
        description=" The Global Lexicostatistical Database",
        publisher_name="Russian State University for the Humanities, Moscow")
    data.add(common.Contribution, "starling", name="Starling", id="starling")

    def row_to_dict(row_entry):
        swadesh_id_idx, swadesh_word_idx, form_idx, cognation_idx, notes_idx = range(
            0, 5)
        return {
            "swadesh_id": row_entry[swadesh_id_idx].value,
            "swadesh_word": row_entry[swadesh_word_idx].value,
            "form": row_entry[form_idx].value,
            "cognation_index": row_entry[cognation_idx].value,
            "notes": row_entry[notes_idx].value,
        }

    data_dir = "./gld/scripts/data/"
    for path in os.listdir(data_dir):
        data_file_path = os.path.join(data_dir, path)
        book = load_workbook(data_file_path)
        sheet = book.active
        lang_name = sheet["C1"].value
        data.add(common.Language,
                 lang_name,
                 id=lang_name,
                 name=lang_name,
                 latitude=52.0,
                 longitude=0.0)

        fields = [
            "swadesh_id", "swadesh_word", "form", "cognation_index", "notes"
        ]
        for row in sheet.iter_rows(min_row=2, min_col=1):
            row_data = row_to_dict(row)
            w = data.add(Word,
                         "%s_%s" % (row_data["swadesh_id"], row_data["form"]),
                         name=row_data["form"],
                         description="Description",
                         jsondata={k: row_data[k]
                                   for k in fields})
            w.language = data["Language"][lang_name]

        DBSession.flush()
Code example #27
def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(tmppath, 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
Code example #28
    def test_CustomModelMixin_polymorphic(self):
        from clld.tests.fixtures import CustomLanguage

        lang = Language(id='def', name='Name')
        clang = CustomLanguage(id='abc', name='Name', custom='c')
        DBSession.add_all([lang, clang])
        DBSession.flush()
        DBSession.expunge_all()
        lang = DBSession.query(Language).filter_by(id='def').one()
        clang = DBSession.query(Language).filter_by(id='abc').one()

        self.assertEqual(lang.polymorphic_type, 'base')
        self.assertEqual(clang.polymorphic_type, 'custom')
        self.assertIs(type(lang), Language)
        self.assertIs(type(clang), CustomLanguage)
Code example #29
File: test_db_meta.py Project: cevmartinez/clld
    def test_CustomModelMixin_polymorphic(self):
        from clld.tests.fixtures import CustomLanguage

        lang = Language(id='def', name='Name')
        clang = CustomLanguage(id='abc', name='Name', custom='c')
        DBSession.add_all([lang, clang])
        DBSession.flush()
        DBSession.expunge_all()
        lang = DBSession.query(Language).filter_by(id='def').one()
        clang = DBSession.query(Language).filter_by(id='abc').one()

        self.assertEqual(lang.polymorphic_type, 'base')
        self.assertEqual(clang.polymorphic_type, 'custom')
        self.assertIs(type(lang), Language)
        self.assertIs(type(clang), CustomLanguage)
Code example #30
 def test_CsvMixin(self):
     l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
     DBSession.add(l1)
     DBSession.flush()
     l1 = Language.csv_query(DBSession).first()
     cols = l1.csv_head()
     row = l1.to_csv()
     for k, v in zip(cols, row):
         if k == 'jsondata':
             self.assertIn('a', json.loads(v))
     l2 = Language.from_csv(row)
     assert_almost_equal(l1.latitude, l2.latitude)
     row[cols.index('latitude')] = '3,5'
     l2 = Language.from_csv(row)
     self.assertLess(l2.latitude, l1.latitude)
Code example #31
File: test_db_meta.py Project: clld/clld
def test_CsvMixin(db):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()
    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            assert 'a' in json.loads(v)
    l2 = Language.from_csv(row)
    assert pytest.approx(l1.latitude) == l2.latitude
    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    assert l2.latitude < l1.latitude
Code example #32
def main(args):
    with open(args.data_file('2.3', 'obsolete_refs.json')) as fp:
        obsolete = json.load(fp)

    with transaction.manager:
        provider = Provider.get('glottolog20121')
        for ref in provider.refs:
            if ref.id in obsolete:
                Config.add_replacement(ref, None, session=DBSession, model=Source)
                DBSession.delete(ref)
            else:
                assert len(ref.providers) > 1

        DBSession.flush()
        DBSession.delete(provider)
Code example #33
File: test_db_meta.py Project: esbesb/clld
def test_CsvMixin(db):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()
    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            assert 'a' in json.loads(v)
    l2 = Language.from_csv(row)
    assert pytest.approx(l1.latitude) == l2.latitude
    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    assert l2.latitude < l1.latitude
Code example #34
 def add_file(self, type_, checksum, file_cls, obj):
     if checksum in self.cdstar:
         jsondata = {k: v for k, v in self.props.get(type_, {}).items()}
         jsondata.update(self.cdstar[checksum])
         f = file_cls(
             id='%s-%s' % (obj.id, checksum),
             name=self.cdstar[checksum]['original'],
             object_pk=obj.pk,
             mime_type=self.cdstar[checksum]['mimetype'],
             jsondata=jsondata)
         DBSession.add(f)
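         # flush and refresh so database-generated values of the new file object are populated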
         DBSession.flush()
         DBSession.refresh(f)
         return
     print('{0} file missing: {1}'.format(type_, checksum))
     return
Code example #35
File: test_db_meta.py Project: esbesb/clld
def test_CustomModelMixin_polymorphic(db, custom_language):
    lang = Language(id='def', name='Name')
    assert repr(lang).startswith("<Language ")
    assert is_base(Language)
    assert not is_base(custom_language)
    clang = custom_language(id='abc', name='Name', custom='c')
    DBSession.add_all([lang, clang])
    DBSession.flush()
    DBSession.expunge_all()
    lang = DBSession.query(Language).filter_by(id='def').one()
    clang = DBSession.query(Language).filter_by(id='abc').one()

    assert lang.polymorphic_type == 'base'
    assert clang.polymorphic_type == 'custom'
    assert type(lang) is Language
    assert type(clang) is custom_language
Code example #36
def _addEditor(dataset, count, lp):
    """For a lighter 'main' function."""
    eds = ['Frank Seifart', 'Ludger Paschen', 'Matthew Stave']
    ed = dorEditor(id=lp[0],
                   name=lp[0],
                   url=lp[1],
                   email=lp[2],
                   address=lp[3],
                   team=lp[4],
                   function=lp[5])
    if lp[0] in eds:
        common.Editor(dataset=dataset, contributor=ed, ord=count + 1)
        count += 1
    DBSession.add(ed)
    DBSession.flush()
    return dataset, count
Code example #37
File: test_db_meta.py Project: clld/clld
def test_CustomModelMixin_polymorphic(db, custom_language):
    lang = Language(id='def', name='Name')
    assert repr(lang).startswith("<Language ")
    assert is_base(Language)
    assert not is_base(custom_language)
    clang = custom_language(id='abc', name='Name', custom='c')
    DBSession.add_all([lang, clang])
    DBSession.flush()
    DBSession.expunge_all()
    lang = DBSession.query(Language).filter_by(id='def').one()
    clang = DBSession.query(Language).filter_by(id='abc').one()

    assert lang.polymorphic_type == 'base'
    assert clang.polymorphic_type == 'custom'
    assert type(lang) is Language
    assert type(clang) is custom_language
Code example #38
File: test_db_models.py Project: cevmartinez/clld
    def test_Files(self):
        from clld.db.models.common import Sentence, Sentence_files

        if PY3:
            return  # pragma: no cover

        l = Sentence(id='abc', name='Name')
        f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
        p = f.create(Path(mkdtemp()).joinpath('clldtest').as_posix(), 'content')
        assert os.path.exists(p)
        rmtree(Path(p).parent.parent)
        l._files.append(f)
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        assert l.files
        assert l.audio
Code example #39
def _addText(lp):
    """For a lighter 'main' function and because of checks."""

    for a in range(1, len(lp)):
        if a == 18:
            if lp[a] == "no":
                lp[a] = False
            else:
                lp[a] = True
        elif a == 17:
            if not lp[a] or str(lp[a]).startswith("check"):
                lp[a] = 0
        elif a == 9:
            genre = lp[9].lower()
            if genre == "personal narrative":
                genre = "pers. narr."
            elif genre == "traditional narrative":
                genre = "trad. narr."
            elif genre == "conversation":
                genre = "convers."
            elif genre == "stimulus-based":
                genre = "stimulus"
            lp[9] = genre
        elif not lp[a]:
            lp[a] = 'na'
    DBSession.add(
        doreContrib(id=lp[1],
                    tname=lp[2],
                    spks=lp[3],
                    spks_age=lp[4],
                    spks_agec=lp[5],
                    spks_sex=lp[6],
                    recdate=lp[7],
                    recdatec=lp[8],
                    genre=lp[9],
                    subgenre=lp[10],
                    gloss=lp[11],
                    transl=lp[12],
                    sound=lp[13],
                    overlap=lp[14],
                    process=lp[15],
                    NAK=lp[16],
                    glottocode=lp[0],
                    words=lp[17],
                    extended=lp[18]))
    DBSession.flush()
Code example #40
File: views.py Project: uwblueprint/glottolog3
def put_identifier(request):
    REQ_FIELDS = ['name', 'type']
    OPT_FIELDS = ['description', 'lang']
    is_partial = False
    new_identifier = request.json_body
    id_query, errors = query_identifier(request.matchdict['type'],
                                        request.matchdict['name'])
    if errors:
        request.response.status = 404
        return {'error': errors}

    identifier = id_query.first()

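    # if none of the required fields is supplied, treat this as a partial update;
    # otherwise fill in any missing fields from the stored identifier before validating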
    if not any(k in new_identifier for k in REQ_FIELDS):
        is_partial = True
    else:
        all_fields = REQ_FIELDS + OPT_FIELDS
        update_fields = (k for k in all_fields if k not in new_identifier)
        for field in update_fields:
            new_identifier[field] = getattr(identifier, field)

    try:
        data, errors = IdentifierSchema(partial=is_partial).load(new_identifier)
    except (ValueError, ValidationError) as e:
        request.response.status = 400
        return {'error': '{}'.format(e)}
    if errors:
        request.response.status = 400
        return {'error': errors}

    try:
        for key in new_identifier:
            # Cannot direct lookup on identifier object
            setattr(identifier, key, getattr(data, key))

        DBSession.flush()
        result = json.dumps(IdentifierSchema().dump(identifier))
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return { 'error': '{}'.format(e) }

    # Commit if no errors
    transaction.commit()

    return result
Code example #41
File: views.py Project: uwblueprint/glottolog3
def delete_languoid(request):
    glottocode = request.matchdict['glottocode']
    languoid = query_languoid(DBSession, glottocode)
    if languoid is None:
        request.response.status = 404
        return {'error': 'Not a valid languoid ID'}

    try:
        languoid.active = False
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}

    request.response.status = 204
    return LanguoidSchema().dump(languoid).data
Code example #42
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    if 1:
        langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
        features = {f.pk: f for f in DBSession.query(models.Feature)}

        for lpk, nf in DBSession.query(common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.language_pk):
            langs[lpk].representation = nf

        for fpk, nl in DBSession.query(common.ValueSet.parameter_pk, func.count(common.ValueSet.pk))\
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk)\
                .group_by(common.ValueSet.parameter_pk):
            features[fpk].representation = nl

        compute_language_sources()

    get_repos()

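    # drop any existing phylogeny data before re-importing the trees below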
    for obj in DBSession.query(LanguageTreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(TreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(Phylogeny).all():
        DBSession.delete(obj)
    DBSession.flush()

    for tree in tqdm(
            iter_trees([l.id for l in DBSession.query(common.Language)],
                       Glottolog(REPOS['glottolog']))):
        nodes = set(n.name for n in tree.traverse())
        phylo = Phylogeny(id=tree.name.split('_')[1],
                          name=tree.name,
                          newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(language=l,
                              treelabel=TreeLabel(id=l.id,
                                                  name=l.id,
                                                  phylogeny=phylo))
        DBSession.add(phylo)
Code example #43
File: test_db_models.py Project: cevmartinez/clld
    def test_Files(self):
        from clld.db.models.common import Sentence, Sentence_files

        if PY3:
            return  # pragma: no cover

        l = Sentence(id='abc', name='Name')
        f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
        p = f.create(
            Path(mkdtemp()).joinpath('clldtest').as_posix(), 'content')
        assert os.path.exists(p)
        rmtree(Path(p).parent.parent)
        l._files.append(f)
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        assert l.files
        assert l.audio
Code example #44
File: test_db_models.py Project: Woseseltops/clld
    def test_Files(self):
        from clld.db.models.common import Sentence, Sentence_files
        from path import path

        if PY3:
            return  # pragma: no cover

        l = Sentence(id='abc', name='Name')
        f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
        p = f.create(path(gettempdir()), 'content')
        assert os.path.exists(p)
        os.remove(p)
        l._files.append(f)
        DBSession.add(l)
        DBSession.flush()
        DBSession.refresh(l)
        assert l.files
        assert l.audio
Code example #45
File: submission.py Project: clld/dictionaria
 def add_file(self, type_, checksum, file_cls, obj, attrs=None):
     if checksum in self.cdstar:
         jsondata = {k: v for k, v in self.props.get(type_, {}).items()}
         jsondata.update(self.cdstar[checksum])
         if attrs:
             jsondata.update(attrs)
         f = file_cls(
             id='%s-%s' % (obj.id, checksum),
             name=self.cdstar[checksum]['original'],
             object_pk=obj.pk,
             mime_type=self.cdstar[checksum]['mimetype'],
             jsondata=jsondata)
         DBSession.add(f)
         DBSession.flush()
         DBSession.refresh(f)
         return
     print('{0} file missing: {1}'.format(type_, checksum))
     return
Code example #46
File: test_db_meta.py Project: cevmartinez/clld
 def test_CsvMixin(self):
     l1 = Language(id='abc',
                   name='Name',
                   latitude=12.4,
                   jsondata=dict(a=None))
     DBSession.add(l1)
     DBSession.flush()
     l1 = Language.csv_query(DBSession).first()
     cols = l1.csv_head()
     row = l1.to_csv()
     for k, v in zip(cols, row):
         if k == 'jsondata':
             self.assertIn('a', json.loads(v))
     l2 = Language.from_csv(row)
     assert_almost_equal(l1.latitude, l2.latitude)
     row[cols.index('latitude')] = '3,5'
     l2 = Language.from_csv(row)
     self.assertLess(l2.latitude, l1.latitude)
Code example #47
    def test_freeze(self):
        from clld.scripts.freeze import freeze_func, unfreeze_func

        tmp = Path(mkdtemp())
        tmp.joinpath('data').mkdir()
        tmp.joinpath('appname').mkdir()

        class Args(object):
            env = self.env
            module_dir = tmp.joinpath('appname').resolve()
            module = Mock(__name__='appname')

            def data_file(self, *comps):
                return tmp.resolve().joinpath('data', *comps)

        DBSession.flush()
        args = Args()
        freeze_func(args, dataset=Dataset.first(), with_history=False)
        self.assert_(tmp.joinpath('data.zip').exists())

        engine = create_engine('sqlite://')
        Base.metadata.create_all(engine)
        self.assertEqual(
            engine.execute('select count(*) from language').fetchone()[0], 0)
        unfreeze_func(args, engine=engine)

        s1 = DBSession
        s2 = sessionmaker(bind=engine)()
        self.assertEqual(
            s1.query(Language).count(),
            s2.query(Language).count())

        l1 = s1.query(Language).filter(Language.latitude != null()).first()
        l2 = s2.query(Language).filter(Language.pk == l1.pk).first()
        self.assertEqual(l1.created, l2.created)
        self.assertEqual(l1.latitude, l2.latitude)
        self.assertEqual(l1.description, l2.description)

        contrib = s2.query(Contribution).filter(
            Contribution.id == 'contribution').one()
        self.assert_(contrib.primary_contributors)
        self.assert_(contrib.secondary_contributors)

        rmtree(tmp, ignore_errors=True)
Code example #48
File: test_db_models.py Project: mitcho/clld
    def test_UnitValue(self):
        from clld.db.models.common import UnitParameter, UnitValue, UnitDomainElement

        p1 = UnitParameter()
        p2 = UnitParameter()
        v = UnitValue(
            unitdomainelement=UnitDomainElement(parameter=p1, name='ude'))
        self.assertEqual(str(v), 'ude')
        DBSession.add(v)
        DBSession.add(p2)
        DBSession.flush()
        try:
            v.unitparameter_pk = p2.pk
            raise ValueError  # pragma: no cover
        except AssertionError:
            pass

        v.unitparameter_pk = p1.pk
        DBSession.flush()
Code example #49
    def test_freeze(self):
        from clld.scripts.freeze import freeze_func, unfreeze_func

        tmp = Path(mkdtemp())
        tmp.joinpath('data').mkdir()
        tmp.joinpath('appname').mkdir()

        class Args(object):
            env = self.env
            module_dir = tmp.joinpath('appname').resolve()
            module = Mock(__name__='appname')

            def data_file(self, *comps):
                return tmp.resolve().joinpath('data', *comps)

        DBSession.flush()
        args = Args()
        freeze_func(args, dataset=Dataset.first(), with_history=False)
        self.assert_(tmp.joinpath('data.zip').exists())

        engine = create_engine('sqlite://')
        Base.metadata.create_all(engine)
        self.assertEqual(
            engine.execute('select count(*) from language').fetchone()[0], 0)
        unfreeze_func(args, engine=engine)

        s1 = DBSession
        s2 = sessionmaker(bind=engine)()
        self.assertEqual(s1.query(Language).count(), s2.query(Language).count())

        l1 = s1.query(Language).filter(Language.latitude != null()).first()
        l2 = s2.query(Language).filter(Language.pk == l1.pk).first()
        self.assertEqual(l1.created, l2.created)
        self.assertEqual(l1.latitude, l2.latitude)
        self.assertEqual(l1.description, l2.description)

        contrib = s2.query(Contribution).filter(Contribution.id == 'contribution').one()
        self.assert_(contrib.primary_contributors)
        self.assert_(contrib.secondary_contributors)

        rmtree(tmp, ignore_errors=True)
Code example #50
File: test_db_models.py Project: clld/clld
def test_UnitValue(db):
    from clld.db.models.common import Unit, Language, UnitParameter, UnitValue, UnitDomainElement

    u = Unit(name='unit', language=Language(name='language'))
    p1 = UnitParameter()
    p2 = UnitParameter()
    # NOTE: we assume parameter of UnitValue and UnitDomainElement are identical
    #       (i.e. we do not enforce/check this)
    v = UnitValue(
        unit=u, unitparameter=p1,
        unitdomainelement=UnitDomainElement(parameter=p1, name='ude'))
    assert str(v) == 'ude'
    DBSession.add(v)
    DBSession.add(p2)
    DBSession.flush()
    try:
        v.unitparameter_pk = p2.pk
        raise ValueError  # pragma: no cover
    except AssertionError:
        pass

    v.unitparameter_pk = p1.pk
    DBSession.flush()
Code example #51
File: util.py Project: Anaphory/lexibank
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
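        # one CLDF dataset per metadata file; sources are deduplicated across datasets via the sources dict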
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
Code example #52
def main(args):
    data = Data()

    files_dir.rmtree()
    files_dir.mkdir()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read('People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0] if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

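    # persist the contributors added above before the dataset and editor links are created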
    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 8, 15),
        #
        # TODO: switch license!
        #
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict((row['ID'], row['RGB_code']) for row in read('Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
        'C**T': 'clitic',
        'IMPF': 'imperfect',
        'INTERM': 'intermediate',
        'NCOMPL': 'noncompletive',
        'NONFUT': 'nonfuture',
        'NPROX': 'nonproximal',
        'NSG': 'nonsingular',
        'PP': 'past participle',
        'PROP': 'proprietive',
        'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    with open(data_dir.joinpath('non-lgr-gloss-abbrs.csv'), 'rb') as csvfile:
        for row in csv.reader(csvfile):
            for match in GLOSS_ABBR_PATTERN.finditer(row[1]):
                if match.group('abbr') not in abbrs:
                    abbrs[match.group('abbr')] = 1
                    DBSession.add(
                        common.GlossAbbreviation(id=match.group('abbr'), name=row[0]))

    non_bibs = {}
    for row in read('References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
            'Additional_information': 'note',
            'Article_title': 'title',
            'Book_title': 'booktitle',
            'City': 'address',
            'Editors': 'editor',
            'Full_reference': None,
            'Issue': None,
            'Journal': 'journal',
            'Language_codes': None,
            'LaTeX_cite_key': None,
            'Pages': 'pages',
            'Publisher': 'publisher',
            'Reference_type': 'type',
            'School': 'school',
            'Series_title': 'series',
            'URL': 'url',
            'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(
            common.Source, row['Reference_ID'],
            id=row['Reference_ID'],
            name=row['Reference_name'],
            description=title,
            author=row['Authors'],
            year=year,
            year_int=year_int,
            bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'),
            jsondata=jsondata,
            **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    gt = {}
    p = re.compile(r'[0-9]+_(?P<name>[^_]+)_(GT|Text)')
    for d in data_dir.joinpath('gt').files():
        m = p.search(unicode(d.basename()))
        if m:
            for part in m.group('name').split('&'):
                # make sure we prefer files named "Text_for_soundfile"
                if slug(unicode(part)) not in gt or 'Text_for_' in d.basename():
                    gt[slug(unicode(part))] = d
    gt_audio = {}
    p = re.compile(r'(?P<name>[^.]+)\.mp3')
    for d in data_dir.joinpath('gt', 'audio').files():
        m = p.search(unicode(d.basename()))
        assert m
        for part in m.group('name').split('&'):
            gt_audio[slug(unicode(part))] = d

    with open(args.data_file('infobox.json')) as fp:
        infobox = json.load(fp)
    for row in read('Languages', 'Order_number'):
        lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
            #base_language=row['Category_base_language'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(common.Language_data(
                object_pk=lect.pk, ord=i, key=item[0], value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] == "Checked":
            desc = row.get('Languages_contribution_documentation::Lect description', '')
        else:
            desc = ''

        c = data.add(
            models.ApicsContribution, row['Language_ID'],
            id=row['Order_number'],
            name=row['Language_name'],
            description=desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        if slug(row['Language_name']) in gt:
            f = common.Contribution_files(
                object=c, id='%s-gt.pdf' % c.id, name='Glossed text', mime_type='application/pdf')
            f.create(files_dir, file(gt[slug(row['Language_name'])]).read())
        else:
            print '--- no glossed text for:', row['Language_name']
        if slug(row['Language_name']) in gt_audio:
            f = common.Contribution_files(
                object=c, id='%s-gt.mp3' % c.id, name='Glossed text audio', mime_type='audio/mpeg')
            f.create(files_dir, file(gt_audio[slug(row['Language_name'])]).read())
        else:
            print '--- no audio for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(
                    common.Identifier, 'iso:%s' % row['ISO_code'],
                    id=row['ISO_code'].lower(),
                    name=row['ISO_code'].lower(),
                    type='iso639-3')

            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(
                    common.Identifier, row['Language_name_ethnologue'],
                    id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'],
                    name=row['Language_name_ethnologue'],
                    type='ethnologue')

            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier'][row['Language_name_ethnologue']]))

    example_count = {}
    soundfiles = {}
    for p in data_dir.joinpath('Soundfiles_Examples').files():
        soundfiles[p.namebase] = p
    for row in read('Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max([example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = data.add(
            common.Sentence, id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={'sort': row['Order_number']},
            language=lang)

        if id_ in soundfiles:
            #print '---> sound', id_
            f = common.Sentence_files(
                object=p, id='%s.mp3' % p.id, name='Audio', mime_type='audio/mpeg')
            f.create(files_dir, file(soundfiles[id_]).read())

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(common.SentenceReference(
                    sentence=p,
                    source=source,
                    key=source.id,
                    description=row['Reference_pages'],
                ))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read('Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(common.ContributionReference(
            contribution=data['ApicsContribution'][row['Language_ID']],
            source=source,
            description=row['Pages'],
            key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read('Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(
            models.Feature, row['Feature_code'],
            name=row['Feature_name'],
            id=id_,
            description=row['Feature_annotation_publication'],
            markup_description=normalize_markup(row['z_calc_Feature_annotation_publication_CSS']),
            feature_type='primary',
            multivalued=row['Value_relation_type'] != 'Single',
            area=row['Feature_area'],
            wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement, '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(models.FeatureAuthor(
                    ord=i + 1, contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(zip(
        primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read('Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]] = str(feature_count)
        p = data.add(
            models.Feature, row['Segment_feature_number'],
            name=name,
            id=str(feature_count),
            feature_type='segment',
            area='Vowels' if truth(row['Vowel']) else (
                'Obstruent consonants' if truth(row['Obstruent'])
                else 'Sonorant consonants'),
            jsondata=dict(
                number=int(row['Segment_feature_number']),
                vowel=truth(row['Vowel']),
                consonant=truth(row['Consonant']),
                obstruent=truth(row['Obstruent']),
                core_list=truth(row['Core_list_segment']),
                symbol=symbol,
            ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(
                common.DomainElement,
                '%s-%s' % (row['Segment_feature_number'], spec[0]),
                id='%s-%s' % (p.id, i),
                name=spec[0],
                parameter=p,
                jsondata={'color': spec[1]},
                number=i)

    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read('Sociolinguistic_features', 'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(
            models.Feature, row['Sociolinguistic_feature_code'],
            name=row['Sociolinguistic_feature_name'],
            id='%s' % feature_count,
            area='Sociolinguistic',
            feature_type='sociolinguistic')

        names = {}

        for i in range(1, 7):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row['Value%s' % i] and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                name = '%s - %s' % (row['Sociolinguistic_feature_name'], i)
            data.add(
                common.DomainElement,
                id_,
                id='%s-%s' % (p.id, i),
                name=name,
                parameter=p,
                number=i,
                jsondata={'color': list(colors.values())[i]})

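    # Segment data: sd guards against duplicate (language, feature) records; soundfiles maps
    # "<Language_ID>-<Segment_feature_number>" to the corresponding audio file.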
    sd = {}
    soundfiles = {}
    for p in data_dir.joinpath('Soundfiles_Segments').files():
        soundfiles[p.namebase] = p
    for row in read('Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        #Language_ID,Segment_feature_number,Comments,Audio_file_name,Example_word,
        #Example_word_gloss,Presence_in_the_language,Refers_to_references_Reference_ID
        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement']['%s-%s' % (
                number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = data.add(
                common.Sentence, '%s-p%s' % (lang.id, data['Feature'][number].id),
                id='%s-%s' % (lang.id, example_count[row['Language_ID']]),
                name=row['Example_word'],
                description=row['Example_word_gloss'],
                language=lang)

            sid = '%(Language_ID)s-%(Segment_feature_number)s' % row
            if sid in soundfiles:
                print('---> sound', sid)
                f = common.Sentence_files(
                    object=p, id='%s.mp3' % p.id, name='Audio', mime_type='audio/mpeg')
                f.create(files_dir, open(soundfiles[sid], 'rb').read())

            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(common.ValueSetReference(
                valueset=valueset, source=source, key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[row['Refers_to_references_Reference_ID']]

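    # Value assignments may refer to lects other than the default one; such lects are created lazily below.
    # wals_value_number caches WALS value numbers per data record for attachment to the matching valuesets.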
    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read('wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

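    # Primary ('Data') and sociolinguistic ('Sociolinguistic_data') records share the same logic;
    # only the column prefix and the number of possible values differ.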
    for _prefix, abbr, num_values in [
        ('', '', 10),
        ('Sociolinguistic', 'sl', 7),
    ]:
        for row in read(prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for',
                      prefix('data', _prefix), row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            #if row[prefix('feature_code', _prefix)] not in data['Feature']:
            #    print row[prefix('feature_code', _prefix)]
            #    print str(row[prefix('data_record_id', _prefix)])
            #    raise ValueError
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' % i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    #valueset=valueset,
                    domainelement=data['DomainElement']['%s-%s' % (
                        row[prefix('feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' % i])
                    if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {'wals_value_number': wals_value_number.pop(row[prefix('data_record_id', _prefix)])}
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(common.ValueSentence(value=value, sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(common.ValueSetReference(
                            valueset=valueset, source=r.source, key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

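    # Attach references listed in the *_references tables to the valuesets created above.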
    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(common.ValueSetReference(
                        valueset=vs,
                        source=source,
                        key=source.id,
                        description=row['Pages'],
                    ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                print('Reference for unknown dataset: %s'
                      % row[prefix + 'ata_record_id'])
                continue

    DBSession.flush()

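    # Link examples to values; rows referring to unknown values or sentences are only counted as missing.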
    missing = 0
    for row in read('Value_examples'):
        try:
            DBSession.add(common.ValueSentence(
                value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row],
                sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row],
                description=row['Notes'],
            ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read('Contributors')):
        kw = dict(
            contribution=data['ApicsContribution'][row['Language ID']],
            contributor=data['Contributor'][row['Author ID']]
        )
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
コード例 #54
0
ファイル: initializedb.py プロジェクト: Castroyesid/wals3
def main(args):  # pragma: no cover
    glottocodes = {}
    for row in GC.execute("select ll.hid, l.id from language as l, languoid as ll where ll.pk = l.pk"):
        if row[0] and len(row[0]) == 3:
            glottocodes[row[0]] = row[1]

    icons = issues.Icons()
    old_db = DB

    vs2008 = get_vs2008(args)

    missing_sources = []
    refdb_ids = {}
    max_id = 7350
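    # References not found in the bibliography get synthetic pks above max_id and are written to
    # missing_source.py for later inspection.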
    with open("/home/robert/venvs/clld/data/wals-data/missing_source.py", "w") as fp:
        for row in old_db.execute("select * from reference"):
            try:
                author, year = row["id"].split("-")
            except ValueError:
                author, year = None, None
            bibdata = get_source(row["id"])
            if not bibdata:
                fp.write('"%s",\n' % row["id"])
                missing_sources.append(row["id"])
                bibdata["pk"] = max_id
                max_id += 1

            if bibdata["pk"] in refdb_ids:
                print("already seen:", row["id"], "as", refdb_ids[bibdata["pk"]])
                data["Source"][row["id"]] = data["Source"][refdb_ids[bibdata["pk"]]]
                continue
            refdb_ids[bibdata["pk"]] = row["id"]

            bibdata.update(
                {
                    "id": row["id"],
                    "name": row["name"],
                    "description": bibdata.get("title", bibdata.get("booktitle")),
                    "google_book_search_id": row["gbs_id"] or None,
                }
            )
            data.add(common.Source, row["id"], **bibdata)

        #
        # TODO: add additional bibdata as data items
        #

    print("sources missing for %s refs" % len(missing_sources))

    for id, name in ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id, name=name))

    migrate("country", models.Country, lambda r: (r["id"], dict(id=r["id"], name=r["name"], continent=r["continent"])))

    migrate("family", models.Family, lambda r: (r["id"], dict(id=r["id"], name=r["name"], description=r["comment"])))

    for row, icon in zip(list(old_db.execute("select * from genus order by family_id")), cycle(iter(icons))):
        genus = data.add(models.Genus, row["id"], id=row["id"], name=row["name"], icon=icon, subfamily=row["subfamily"])
        genus.family = data["Family"][row["family_id"]]
    DBSession.flush()

    migrate(
        "altname",
        common.Identifier,
        lambda r: ((r["name"], r["type"]), dict(name=r["name"], type="name", description=r["type"])),
    )

    # names for isolanguages are not unique!
    enames = {}
    for r in DB.execute("select * from isolanguage"):
        id_ = "ethnologue-%s" % r["id"]
        if r["name"] in enames:
            data["Identifier"][id_] = enames[r["name"]]
        else:
            enames[r["name"]] = data.add(
                common.Identifier, id_, id=id_, name=r["name"], type="name", description="ethnologue"
            )
    DBSession.flush()

    migrate(
        "isolanguage",
        common.Identifier,
        lambda r: (
            r["id"],
            dict(id=r["id"], name=r["id"], type=common.IdentifierType.iso.value, description=r["name"]),
        ),
    )

    migrate(
        "isolanguage",
        common.Identifier,
        lambda r: None
        if r["id"] not in glottocodes
        else (
            "gc-%s" % r["id"],
            dict(
                id="gc-%s" % r["id"],
                name=glottocodes[r["id"]],
                type=common.IdentifierType.glottolog.value,
                description=r["name"],
            ),
        ),
    )

    migrate(
        "language",
        models.WalsLanguage,
        lambda r: (
            r["id"],
            dict(
                id=r["id"],
                name=r["name"],
                latitude=r["latitude"],
                longitude=r["longitude"],
                ascii_name=r["ascii_name"],
                genus=data["Genus"][r["genus_id"]],
                samples_100=r["samples_100"] != 0,
                samples_200=r["samples_200"] != 0,
            ),
        ),
    )

    migrate(
        "author",
        common.Contributor,
        lambda r: (r["id"], dict(name=r["name"], url=r["www"], id=r["id"], description=r["note"])),
    )

    dataset = common.Dataset(
        id="wals",
        name="WALS Online",
        description="The World Atlas of Language Structures Online",
        domain="wals.info",
        published=date(2013, 8, 15),
        contact="*****@*****.**",
        license="http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en",
        jsondata={
            "license_icon": "http://wals.info/static/images/cc_by_nc_nd.png",
            "license_name": "Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany",
        },
    )
    DBSession.add(dataset)

    for i, editor in enumerate(["dryerms", "haspelmathm"]):
        common.Editor(dataset=dataset, contributor=data["Contributor"][editor], ord=i + 1)

    migrate(
        "country_language",
        models.CountryLanguage,
        lambda r: dict(
            language_pk=data["WalsLanguage"][r["language_id"]].pk, country_pk=data["Country"][r["country_id"]].pk
        ),
    )

    migrate(
        "altname_language",
        common.LanguageIdentifier,
        lambda r: dict(
            language=data["WalsLanguage"][r["language_id"]],
            identifier=data["Identifier"][(r["altname_name"], r["altname_type"])],
            description=r["relation"],
        ),
    )

    migrate(
        "isolanguage_language",
        common.LanguageIdentifier,
        lambda r: dict(
            language=data["WalsLanguage"][r["language_id"]],
            identifier=data["Identifier"][r["isolanguage_id"]],
            description=r["relation"],
        ),
    )

    migrate(
        "isolanguage_language",
        common.LanguageIdentifier,
        lambda r: None
        if "ethnologue-%s" % r["isolanguage_id"] not in data["Identifier"]
        else dict(
            language=data["WalsLanguage"][r["language_id"]],
            identifier=data["Identifier"]["ethnologue-%s" % r["isolanguage_id"]],
            description=r["relation"],
        ),
    )

    migrate(
        "isolanguage_language",
        common.LanguageIdentifier,
        lambda r: None
        if "gc-%s" % r["isolanguage_id"] not in data["Identifier"]
        else dict(
            language=data["WalsLanguage"][r["language_id"]],
            identifier=data["Identifier"]["gc-%s" % r["isolanguage_id"]],
            description=r["relation"],
        ),
    )

    migrate(
        "area", models.Area, lambda r: (r["id"], dict(name=r["name"], dbpedia_url=r["dbpedia_url"], id=str(r["id"])))
    )

    def migrate_chapter(row):
        kw = dict(
            id=row["id"],
            name=row["name"],
            wp_slug=row["blog_title"],
            sortkey=int(row["id"]),
            area=data["Area"][row["area_id"]],
        )
        if int(row["id"]) in [143, 144]:
            kw["created"] = E2011
            kw["updated"] = E2011
        return row["id"], kw

    migrate("chapter", models.Chapter, migrate_chapter)

    def migrate_supplement(row):
        if row["name"] not in ["Help", "Abbreviations"]:
            sortkey = 990 + int(row["id"]) if row["name"] != "Introduction" else 0
            id_ = "s%s" % row["id"]
            kw = dict(id=id_, name=row["name"], sortkey=sortkey)
            return id_, kw

    migrate("supplement", models.Chapter, migrate_supplement)

    migrate(
        "chapter_reference",
        common.ContributionReference,
        lambda r: dict(contribution=data["Chapter"][r["chapter_id"]], source=data["Source"][r["reference_id"]]),
    )

    migrate(
        "reference_supplement",
        common.ContributionReference,
        lambda r: dict(
            contribution=data["Chapter"]["s%s" % r["supplement_id"]], source=data["Source"][r["reference_id"]]
        ),
    )

    def migrate_feature(row):
        kw = dict(id=row["id"], name=row["name"], ordinal_qualifier=row["id"][-1])
        if row["id"].startswith("143") or row["id"].startswith("144"):
            kw["created"] = E2011
            kw["updated"] = E2011
        kw["chapter"] = data["Chapter"][row["chapter_id"]]
        return row["id"], kw

    migrate("feature", models.Feature, migrate_feature)

    def migrate_value(row):
        desc = row["description"]
        if desc == "SOV & NegV/VNeg":
            if row["icon_id"] != "s9ff":
                desc += " (a)"
            else:
                desc += " (b)"
        kw = dict(
            id="%s-%s" % (row["feature_id"], row["numeric"]),
            name=desc,
            description=row["long_description"],
            jsondata=dict(icon=issues.Icons.id(row["icon_id"])),
            number=row["numeric"],
            parameter=data["Feature"][row["feature_id"]],
        )
        return (row["feature_id"], row["numeric"]), kw

    migrate("value", common.DomainElement, migrate_value)

    same = 0
    added = 0
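    # Compare each datapoint against the 2008 values to decide whether it is unchanged, changed,
    # or newly added, and set the created/updated timestamps accordingly.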
    for row in old_db.execute("select * from datapoint"):
        parameter = data["Feature"][row["feature_id"]]
        language = data["WalsLanguage"][row["language_id"]]
        id_ = "%s-%s" % (parameter.id, language.id)
        created = E2008
        updated = E2008

        value_numeric = row["value_numeric"]
        if (language.id, parameter.id) in vs2008:
            if vs2008[(language.id, parameter.id)] != row["value_numeric"]:
                print("~~~", id_, vs2008[(language.id, parameter.id)], "-->", row["value_numeric"])
                value_numeric = vs2008[(language.id, parameter.id)]
            else:
                same += 1
        else:
            updated = E2011
            created = E2011
            if parameter.id[-1] == "A" and not (parameter.id.startswith("143") or parameter.id.startswith("144")):
                added += 1

        kw = dict(id=id_, updated=updated, created=created)
        valueset = data.add(
            common.ValueSet, row["id"], language=language, parameter=parameter, contribution=parameter.chapter, **kw
        )
        data.add(
            common.Value,
            id_,
            domainelement=data["DomainElement"][(row["feature_id"], value_numeric)],
            valueset=valueset,
            **kw
        )

    print(same, "datapoints did not change")
    print(added, "datapoints added to existing features")

    DBSession.flush()

    migrate(
        "datapoint_reference",
        common.ValueSetReference,
        lambda r: dict(
            valueset=data["ValueSet"][r["datapoint_id"]],
            source=data["Source"][r["reference_id"]],
            description=r["note"],
        ),
    )

    migrate(
        "author_chapter",
        common.ContributionContributor,
        lambda r: dict(
            ord=r["order"],
            primary=r["primary"] != 0,
            contributor_pk=data["Contributor"][r["author_id"]].pk,
            contribution_pk=data["Chapter"][r["chapter_id"]].pk,
        ),
    )

    migrate(
        "author_supplement",
        common.ContributionContributor,
        lambda r: dict(
            ord=r["order"],
            primary=r["primary"] != 0,
            contributor_pk=data["Contributor"][r["author_id"]].pk,
            contribution_pk=data["Chapter"]["s%s" % r["supplement_id"]].pk,
        ),
    )

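    # Group IGT renderings by example id; examples with IGT become one Sentence per rendering,
    # the rest fall back to a teaser of the raw xhtml.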
    igts = defaultdict(lambda: [])
    for row in old_db.execute("select * from igt"):
        d = {"id": "igt-%s" % row["id"]}
        d.update(parse_igt(row["xhtml"]))
        igts[row["example_id"]].append(d)

    for row in old_db.execute("select * from example"):
        if not row["language_id"]:
            print("example without language:", row["id"])
            continue

        _igts = igts[row["id"]]
        if _igts:
            for igt in _igts:
                data.add(
                    common.Sentence,
                    igt["id"],
                    markup_comment=row["xhtml"],
                    language=data["WalsLanguage"][row["language_id"]],
                    **igt
                )
        else:
            name = teaser(row["xhtml"])
            if name:
                data.add(
                    common.Sentence,
                    row["id"],
                    id=str(row["id"]),
                    name=name,
                    xhtml=row["xhtml"],
                    language=data["WalsLanguage"][row["language_id"]],
                )

    missing = {}
    for row in old_db.execute("select * from example_feature"):
        _igts = igts[row["example_id"]]
        if _igts:
            for igt in _igts:
                try:
                    sentence = data["Sentence"][igt["id"]]
                except KeyError:
                    print("missing sentence:", row["example_id"])
                    continue
                try:
                    value = data["Value"]["%s-%s" % (row["feature_id"], sentence.language.id)]
                    DBSession.add(common.ValueSentence(sentence=sentence, value=value))
                except KeyError:
                    missing[(row["feature_id"], sentence.language.id)] = 1
        else:
            try:
                sentence = data["Sentence"][row["example_id"]]
            except KeyError:
                print("missing sentence:", row["example_id"])
                continue
            try:
                value = data["Value"]["%s-%s" % (row["feature_id"], sentence.language.id)]
                DBSession.add(common.ValueSentence(sentence=sentence, value=value))
            except KeyError:
                missing[(row["feature_id"], sentence.language.id)] = 1

    print(len(missing), "missing datapoints for example_feature relations")
コード例 #55
0
ファイル: initializedb.py プロジェクト: clld/waab
def main(args):
    citations.main(args)
    data = Data()

    pairs = {}
    languages = {}

    coords = {}
    for lang in dsv.rows(
        args.data_file('MB_Map_Data_Aug13WLabels'),
        namedtuples=True,
        newline='\n',
        encoding='latin1'
    ):
        coords[slug(lang.Label.split('<')[0].strip())] = (
            float(lang.y), float(lang.x))

    xls = xlrd.open_workbook(args.data_file('MB_BoCatSum_AFBO.xlsx'))
    matrix = xls.sheet_by_name('MB_BoCatSum_AFBO.txt')
    md = "area\trecipient language iso\trecipient language genus\tdonor language iso\tdonor language genus".split('\t')

    fields = []
    params = []
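    # Spreadsheet columns H-Z and AA-AX hold the affix-function parameters; the remaining columns
    # are treated as (lower-cased) metadata fields.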
    for i in range(matrix.ncols):
        colname = xlrd.colname(i)
        if len(colname) == 2 and colname > 'BE':
            break
        colval = matrix.cell(0, i).value.strip()
        if (len(colname) == 1 and colname > 'G') or (len(colname) == 2 and colname < 'AY'):
            params.append(colval)
            fields.append(colval)
        else:
            fields.append(colval.lower())

    for f in fields:
        if fields.count(f) > 1:
            print(f)

    assert len(fields) == len(set(fields))

    for j in range(1, matrix.nrows):
        values = dict(zip(fields, [matrix.cell(j, i).value for i in range(matrix.ncols)]))
        try:
            id_ = int(values['perm.id'])
        except (KeyError, ValueError):
            continue

        pairs[id_] = values
        for type_ in ['recipient', 'donor']:
            languages[values[type_ + ' language'].strip()] = {
                'macroarea': values['area']}
            for md in ['iso', 'genus']:
                languages[values[type_ + ' language'].strip()][md] \
                    = values['%s language %s' % (type_, md)]

    for name in COORDS:
        assert name in languages

    sources = {}
    with open(args.data_file('MB_Case_List_with_links.html')) as fp:
        worddoc = fp.read()
        for m in re.finditer('\"__(?P<recid>[^_]+)__\"', worddoc):
            sources[m.group('recid').decode('utf8')] = 1
        soup = bs(worddoc)

    doc = {}
    cols = []
    table = soup.find('table')
    for tr in table.children:
        if tr.name != 'tr':
            continue
        tds = filter(lambda n: n.name == 'td', tr.children)
        if not cols:
            cols = map(text, tds)
        else:
            values = dict(zip(cols, tds))
        try:
            id_ = int(text(values['perm.id']))
            doc[id_] = values
            if id_ in pairs:
                assert doc['Recipient lg.'] == pairs[id_][1]['recipient language']
                assert doc['Don'] == pairs[id_][1]['donor language']
        except:
            continue

    dataset = common.Dataset(
        id='afbo',
        name="AfBo: A world-wide survey of affix borrowing",
        contact="*****@*****.**",
        domain="afbo.info",
        license='http://creativecommons.org/licenses/by/3.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})

    DBSession.add(dataset)
    for i, spec in enumerate([('seifart', "Frank Seifart")]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    contrib = data.add(common.Contribution, 'afbo', name="AfBo", id="afbo")

    iso_map = {
        ('ron', 'Meglenite Romanian'): ('ruq', None),
        ('fra', 'Norman French'): ('xno', None),
        ('tur', 'Turkic'): (None, 'turk1311'),
        ('xuu', 'Kxoe languages'): (None, 'khoe1241'),
        ('zoc', 'Zoquean languages'): (None, 'zoqu1261'),
        ('tzm', 'Moroccan Berber languages'): (None, 'atla1275'),
        ('cvn', 'Quechua'): ('qvn', None),
        ('rop', 'Gurindji Kriol'): (None, 'guri1249'),
        ('ita', 'Sicilian Italian'): ('scn', None),
        ('srp', 'Croatian'): ('hrv', None),
        ('eme', 'Wayampi‑Emerillon‑Zo’é'): (None, 'waya1271'),
        ('ale', 'Copper Island Aleut'): ('mud', None),
        ('car', 'intermediate Proto‑Carib'): (None, 'cari1283'),
        ('ell', 'Cappadocian Greek'): ('cpg', None),
        ('eng', 'Middle English'): ('enm', None),
        ('als', 'Arvanitic Albanian'): ('aat', None),
        ('nys', 'Northern Nyungic'): (None, 'dese1234'),
        ('ron', 'Istro‑Romanian'): ('ruo', None),
        ('chf', 'Cho’ol'): ('ctu', None),
        ('tuo', 'Eastern Tucanoan languages'): (None, 'east2698'),
        ('ceb', 'Visayan'): (None, 'bisa1268'),
        ('por', 'Sri Lanka Portuguese'): (None, 'mala1544'),
        ('brx', 'Tibeto-Burman languages'): (None, 'brah1260'),
    }

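    # Create Language objects, resolving coordinates from several sources and logging
    # ISO/Glottolog name mismatches to name_conflicts.tab.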
    with open('name_conflicts.tab', 'w') as fp:
        fp.write('iso\tafbo\tglottolog\tproposed iso\n')
        for i, name in enumerate(languages.keys()):
            md = languages[name]
            iso = md.pop('iso')
            if iso == 'cvn' and name == 'Quechua':
                iso = 'qvn'
            kw = dict(name=name, id=str(i+1), jsondata=md)
            if name in COORDS:
                kw['latitude'], kw['longitude'] = COORDS[name]
            elif slug(name) in coords:
                kw['latitude'], kw['longitude'] = coords[slug(name)]
            elif glottocoords.get(iso):
                kw['latitude'], kw['longitude'] = glottocoords[iso]

            if glottonames.get(iso) and slug(glottonames.get(iso)) != slug(name):
                fp.write(('%s\t%s\t%s\t%s\n' % (
                    iso, name, glottonames.get(iso), rglottonames.get(slug(name), ''))).encode('utf8'))

            if name == 'Meglenite Romanian':
                kw['name'] = 'Megleno Romanian'
            if 'latitude' not in kw:
                print(name)
            l = data.add(common.Language, name, **kw)

            iso, gc = iso_map.get((iso, name), (iso, None))

            for code, type_ in [
                (iso, common.IdentifierType.iso),
                (gc or glottocodes.get(iso), common.IdentifierType.glottolog)
            ]:
                if code:
                    identifier = data.add(
                        common.Identifier, code, id=code, name=code, type=type_.value)
                    data.add(
                        common.LanguageIdentifier, '%s-%s' % (code, l.id),
                        identifier=identifier, language=l)

    include = sources.keys() + [
        'myersscottoncontact2002', 'myersscottonlanguage2007',
        'meakinsborrowing2011', 'seifartprinciple2012',
    ]
    refdb = bibtex.Database.from_file(args.data_file('FSeifartZoteroLibrary14Nov2013.bib'))
    for rec in refdb:
        if slug(rec.id) in include:
            data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for i, name in enumerate(params):
        data.add(models.AffixFunction, name, id=str(i + 1), name=name)

    for id_, vd in pairs.items():
        assert id_ in doc

        donor = data['Language'][vd['donor language'].strip()]
        recipient = data['Language'][vd['recipient language'].strip()]

        p = data.add(
            models.Pair,
            id_,
            id=str(id_),
            name=vd['pairs'].replace('Meglenite', 'Megleno'),
            area=recipient.jsondata['macroarea'],
            description=unicode(doc[id_]['comment']).replace('<h1', '<p').replace('</h1>', '</p>').replace('Meglenite', 'Megleno'),
            reliability=vd['reliability'],
            int_reliability=['high', 'mid', 'low'].index(vd['reliability']),
            count_interrel=int(vd[u'number of interrelated affixes']),
            count_borrowed=int(vd['number of borrowed affixes']),
            donor=donor,
            recipient=recipient)
        DBSession.flush()

        for i, param in enumerate(params):
            param_id = i + 1
            value = vd[param]
            if value != '':
                vsid = '%s-%s' % (recipient.id, param_id)
                if vsid in data['ValueSet']:
                    vs = data['ValueSet'][vsid]
                else:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id=vsid,
                        parameter=data['AffixFunction'][param],
                        language=recipient,
                        contribution=contrib)
                data.add(
                    models.waabValue,
                    '%s-%s' % (id_, param_id),
                    id='%s-%s' % (id_, param_id),
                    pair=p,
                    name='%s' % int(value),
                    numeric=int(value),
                    description='%s' % p,
                    valueset=vs)
コード例 #56
0
ファイル: initializedb.py プロジェクト: Anaphory/culturebank
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    compute_language_sources()
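    # NOTE: the early return below disables the expensive stability/dependency/deep-family
    # computations that follow.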
    return 
    from time import time
    _s = time()

    def checkpoint(s, msg=None):
        n = time()
        print(n - s, msg or '')
        return n

    sql = """
select p.id, l.id, v.name from value as v, valueset as vs, language as l, parameter as p
where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
    """
    datatriples = [(v[0], v[1], v[2]) for v in DBSession.execute(sql)]
    _s = checkpoint(_s, '%s values loaded' % len(datatriples))

    flv = dict([(feature, dict(lvs)) for (feature, lvs) in grp(datatriples).items()])
    _s = checkpoint(_s, 'triples grouped')

    clfps = list(get_clf_paths([row[0] for row in DBSession.execute("select id from language")]))
    _s = checkpoint(_s, '%s clfps loaded' % len(clfps))

    features = {f.id: f for f in DBSession.query(Feature)}
    for (f, lv) in flv.items():
        features[f].representation = len(lv)
    DBSession.flush()
    _s = checkpoint(_s, 'representation assigned')

    families = {f.id: f for f in DBSession.query(Family)}
    if False:
        fs = feature_stability(datatriples, clfps)
        _s = checkpoint(_s, 'feature_stability computed')

        for (f, (s, transitions, stationarity_p, synchronic_p)) in fs:
            print(f)
            stability = Stability(
                id=f.replace("GB", "S"),
                feature=features[f],
                parsimony_stability_value=s["stability"],
                parsimony_retentions=s["retentions"],
                parsimony_transitions=s["transitions"],
                jsondata={'diachronic_p': stationarity_p, "synchronic_p": synchronic_p})
            DBSession.add(stability)
            for (i, (fam, (fromnode, tonode), (ft, tt))) in enumerate(transitions):
                DBSession.add(Transition(
                    id="%s: %s->%s" % (f, fromnode, tonode),
                    stability=stability,
                    fromnode=get_name(fromnode),
                    tonode=get_name(tonode),
                    fromvalue=ft,
                    tovalue=tt,
                    family=families[fam],
                    retention_innovation="Retention" if ft == tt else "Innovation"))
        DBSession.flush()
        _s = checkpoint(_s, 'stability and transitions loaded')

    imps = feature_dependencies(datatriples)
    _s = checkpoint(_s, 'feature_dependencies computed')
    if True:
        (H, V) = dependencies_graph([(v, f1, f2) for ((v, dstats), f1, f2) in imps])
        _s = checkpoint(_s, 'dependencies_graph written')

        for (i, ((v, dstats), f1, f2)) in enumerate(imps):
            combinatory_status = ("primary" if (f1, f2) in H else ("epiphenomenal" if v > 0.0 else None)) if H else "N/A"
            DBSession.add(Dependency(
                id="%s->%s" % (f1, f2),
                strength=v,
                feature1=features[f1],
                feature2=features[f2],
                representation=dstats["representation"],
                combinatory_status=combinatory_status,
                jsondata=dstats))
        DBSession.flush()
        _s = checkpoint(_s, 'dependencies loaded')

    coordinates = {
        lg.id: (lg.longitude, lg.latitude)
        for lg in DBSession.query(common.Language)
        .filter(common.Language.longitude != None)
        .filter(common.Language.latitude != None)}
    deepfams = deep_families(datatriples, clfps, coordinates=coordinates)
    _s = checkpoint(_s, '%s deep_families computed' % len(deepfams))

    missing_families = set()
    data = Data()
    for ((l1, l2), support_value, significance, supports, f1c, f2c) in deepfams:
        dname = "proto-%s x proto-%s" % (glottolog_names[l1], glottolog_names[l2])
        kmdistance = havdist(f1c, f2c)
        (f1lon, f1lat) = f1c if f1c else (None, None)
        (f2lon, f2lat) = f2c if f2c else (None, None)

        for li in [l1, l2]:
            if li not in families:
                missing_families.add(li)

        deepfam = DeepFamily(
            id=dname,
            support_value=support_value,
            significance=significance,
            family1=families.get(l1),
            family2=families.get(l2),
            family1_latitude=f1lat,
            family1_longitude=f1lon,
            family2_latitude=f2lat,
            family2_longitude=f2lon,
            geographic_plausibility=kmdistance)
        DBSession.add(deepfam)
        for (f, v1, v2, historical_score, independent_score, support_score) in supports:
            vid = ("%s: %s %s %s" % (f, v1, "==" if v1 == v2 else "!=", v2)).replace(".", "")
            #vname = ("%s|%s" % (v1, v2)).replace(".", "")
            #print vid, vname
            if vid not in data["Support"]:
                data.add(
                    Support, vid,
                    id=vid,
                    historical_score=historical_score,
                    independent_score=independent_score,
                    support_score=support_score,
                    value1=v1,
                    value2=v2,
                    feature=features[f])
            DBSession.add(HasSupport(
                id=dname + "-" + vid,
                deepfamily=deepfam,
                support=data["Support"][vid]))
    print('missing_families:')
    print(missing_families)
    DBSession.flush()
    _s = checkpoint(_s, 'deep_families loaded')

    compute_language_sources()
コード例 #57
0
ファイル: import_tree.py プロジェクト: kublaj/glottolog3
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
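        # languoids.json holds the new classification: existing languoids are updated in place,
        # new ones are created with identifiers and macroareas.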
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)
                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name),
                        l,
                        name=l.name,
                        description='Glottolog',
                        type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

                DBSession.flush()

        recreate_treeclosure()
コード例 #58
0
def populate_test_db(engine):
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(
        cm.Dataset,
        domain='clld',
        jsondata={'license_icon': 'cc-by', 'license_url': 'http://example.org'})

    data.add_default(cm.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'}.items():
        data.add(cm.Contributor, id_, id=id_, name=name, url='http://example.org')

    DBSession.add(
        cm.Editor(dataset=data[cm.Dataset], contributor=data[cm.Contributor]))

    data.add_default(cm.Source)
    data.add(
        cm.Source,
        'replaced',
        id='replaced',
        active=False,
        jsondata={'__replacement_id__': 'source'})

    data.add_default(cm.Contribution)
    cm.ContributionReference(contribution=data[cm.Contribution], source=data[cm.Source])

    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'), (False, 'd')]:
        cm.ContributionContributor(
            contribution=data[cm.Contribution],
            primary=primary,
            contributor=data['Contributor'][c])

    data.add_default(cm.Language, latitude=10.5, longitude=0.3)
    data[cm.Language].sources.append(data[cm.Source])

    for i, type_ in enumerate(cm.IdentifierType):
        cm.LanguageIdentifier(
            language=data[cm.Language],
            identifier=cm.Identifier(type=type_.value, id=type_.value + str(i), name='a'))

    cm.LanguageIdentifier(
        language=data[cm.Language],
        identifier=cm.Identifier(type='name', id='name', name='a'))

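    # Add 100 more languages with synthetic ISO 639-3 identifiers.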
    for i in range(2, 102):
        _l = cm.Language(id='l%s' % i, name='Language %s' % i)
        _i = cm.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        cm.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = data.add_default(cm.Parameter)
    de = cm.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = cm.DomainElement(id='de2', name='DomainElement2', parameter=param)

    valueset = data.add_default(
        cm.ValueSet,
        language=data[cm.Language],
        parameter=param,
        contribution=data[cm.Contribution])
    cm.ValueSetReference(valueset=valueset, source=data[cm.Source], description='10-20')

    data.add_default(
        cm.Value,
        domainelement=de,
        valueset=valueset,
        frequency=50,
        confidence='high')
    data.add(
        cm.Value, 'value2',
        id='value2',
        domainelement=de2,
        valueset=valueset,
        frequency=50,
        confidence='high')

    paramnd = data.add(
        cm.Parameter,
        'no-domain',
        id='no-domain',
        name='Parameter without domain')
    valueset = cm.ValueSet(
        id='vs2',
        language=data[cm.Language],
        parameter=paramnd,
        contribution=data[cm.Contribution])

    cm.ValueSetReference(valueset=valueset, source=data[cm.Source], description='10-20')
    cm.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    unit = data.add_default(cm.Unit, language=data[cm.Language])
    up = data.add_default(cm.UnitParameter)
    cm.UnitValue(
        id='unitvalue', name='UnitValue', unit=unit, unitparameter=up)

    up2 = cm.UnitParameter(id='up2', name='UnitParameter with domain')
    de = cm.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(cm.UnitValue(
        id='uv2',
        name='UnitValue2',
        unit=unit,
        unitparameter=up2,
        unitdomainelement=de))

    DBSession.add(cm.Source(id='s'))

    sentence = data.add_default(
        cm.Sentence,
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=data[cm.Language],
        jsondata={'alt_translation': 'Spanish: ...'})
    cm.SentenceReference(sentence=sentence, source=data[cm.Source])
    DBSession.add(cm.Config(key='key', value='value'))

    cm.Config.add_replacement('replaced', 'language', model=cm.Language)
    cm.Config.add_replacement('gone', None, model=cm.Language)
    DBSession.flush()
コード例 #59
0
ファイル: util.py プロジェクト: FieldDB/clld
    def setUp(self):
        TestWithDb.setUp(self)

        DBSession.add(common.Dataset(
            id='dataset',
            name='dataset',
            description='desc',
            domain='clld',
            jsondata={'license_icon': 'cc-by'}))

        DBSession.add(common.Source(
            id='replaced', active=False, jsondata={'__replacement_id__': 'source'}))
        source = common.Source(id='source')
        contributors = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'}
        for id_, name in contributors.items():
            contributors[id_] = common.Contributor(
                id=id_, name=name, url='http://example.org')

        contribution = common.Contribution(id='contribution', name='Contribution')
        common.ContributionReference(contribution=contribution, source=source)
        assert common.ContributionContributor(
            contribution=contribution,
            primary=True,
            contributor=contributors['contributor'])
        assert common.ContributionContributor(
            contribution=contribution, primary=False, contributor=contributors['b'])
        assert common.ContributionContributor(
            contribution=contribution, primary=True, contributor=contributors['c'])
        assert common.ContributionContributor(
            contribution=contribution, primary=False, contributor=contributors['d'])

        DBSession.add(contribution)

        language = common.Language(
            id='language', name='Language 1', latitude=10.5, longitude=0.3)
        language.sources.append(source)
        for i, type_ in enumerate(common.IdentifierType):
            id_ = common.Identifier(type=type_.value, id=type_.value + str(i), name='abc')
            common.LanguageIdentifier(language=language, identifier=id_)

        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
            common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de', name='DomainElement', parameter=param)
        de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
        valueset = common.ValueSet(
            id='valueset', language=language, parameter=param, contribution=contribution)
        value = common.Value(
            id='value',
            domainelement=de,
            valueset=valueset,
            frequency=50,
            confidence='high')
        DBSession.add(value)
        value2 = common.Value(
            id='value2',
            domainelement=de2,
            valueset=valueset,
            frequency=50,
            confidence='high')
        DBSession.add(value2)
        paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
        valueset = common.ValueSet(
            id='vs2', language=language, parameter=paramnd, contribution=contribution)
        common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(common.UnitValue(
            id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))

        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(common.UnitValue(
            id='uv2',
            name='UnitValue2',
            unit=unit,
            unitparameter=up2,
            unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(
            id='sentence',
            name='sentence name',
            description='sentence description',
            analyzed='a\tmorpheme\tdoes\tdo',
            gloss='a\tmorpheme\t1SG\tdo.SG2',
            source='own',
            comment='comment',
            original_script='a morpheme',
            language=language,
            jsondata={'alt_translation': 'Spanish: ...'})
        common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))

        common.Config.add_replacement('replaced', 'language', model=common.Language)
        common.Config.add_replacement('gone', None, model=common.Language)
        DBSession.flush()