def load_examples(self, dictionary, data, lang):
    abbr_p = re.compile('\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')
    for i, ex in enumerate(
            Examples.from_file(self.dir.joinpath('processed', 'examples.sfm'))):
        obj = data.add(
            models.Example,
            ex.id,
            id='%s-%s' % (self.id, ex.id.replace('.', '_')),
            name=ex.text,
            number='{0}'.format(i + 1),
            source=ex.corpus_ref,
            language=lang,
            serialized='{0}'.format(ex),
            dictionary=dictionary,
            analyzed=ex.morphemes,
            gloss=abbr_p.sub(lambda m: m.group('abbr').upper(), ex.gloss)
            if ex.gloss else ex.gloss,
            description=ex.translation,
            alt_translation1=ex.alt_translation,
            alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
            alt_translation2=ex.alt_translation2,
            alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'))
        DBSession.flush()
        if ex.soundfile:
            self.add_file('audio', ex.soundfile, common.Sentence_files, obj)
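# Minimal, self-contained sketch (not part of the loader above) of how the
# abbr_p pattern rewrites "$"-prefixed gloss abbreviations to upper case,
# e.g. "$pl" -> "PL". The sample gloss string is invented for illustration.
import re

abbr_p = re.compile(r'\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')
print(abbr_p.sub(lambda m: m.group('abbr').upper(), 'dog-$pl run-$3sg'))
# -> 'dog-PL run-3SG'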
def setUp(self):
    TestWithDb.setUp(self)

    DBSession.add(common.Dataset(id='d', name='test', domain='localhost'))
    family = Family(id='f', name='family', description='desc')
    DBSession.add(LanguageWithFamily(id='l1', family=family))
    DBSession.add(LanguageWithFamily(id='l2'))
    DBSession.flush()
def setUp(self):
    super(_TestWithDb, self).setUp()

    DBSession.add(common.Dataset(id='d', name='test', domain='localhost'))
    family = Family(id='f', name='family', description='desc', jsondata=dict(icon=1))
    DBSession.add(LanguageWithFamily(id='l1', family=family))
    DBSession.add(LanguageWithFamily(id='l2'))
    DBSession.flush()
def create_languoid(row, father_pk=None):
    glottocode = {'akun1242': 'akun1241'}.get(row['alnumcode'], row['alnumcode'])
    attrs = dict(
        pk=row['id'],
        id=glottocode,
        name=row['primaryname'],
        description=row['globalclassificationcomment'],
        level=getattr(models2.LanguoidLevel, row['level']),
        status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None),
        father_pk=father_pk,
        created=row['updated'],
        jsondata={} if not row['hname'] else {'hname': row['hname']},
    )
    for attr in ['active', 'updated', 'hid', 'latitude', 'longitude']:
        attrs[attr] = row[attr]
    l = data.add(models2.Languoid, row['id'], **attrs)

    for type_ in params:
        id_ = '%s%s' % (type_, row['id'])
        vs = data.add(
            common.ValueSet, id_,
            id=id_,
            description=row['classificationcomment'] if type_ == 'fc'
            else row['subclassificationcomment'],
            language=l,
            parameter=params[type_],
            contribution=contrib)
        data.add(
            common.Value, id_,
            id=id_,
            name='%s - %s' % (row['level'], row['status']),
            valueset=vs)
        DBSession.flush()
        valuesets[id_] = vs.pk
    return str(row['id'])
def put_languoid(request):
    glottocode = request.matchdict['glottocode']
    languoid = query_languoid(DBSession, glottocode)
    if languoid is None:
        request.response.status = 404
        return {'error': 'Not a valid languoid ID'}

    json_data = request.json_body
    try:
        data, errors = LanguoidSchema(partial=True).load(json_data)
    except ValueError:
        request.response.status = 400
        return {'error': 'Not a valid languoid level'}
    if errors:
        request.response.status = 400
        return {'error': errors}

    try:
        for key, value in data.items():
            setattr(languoid, key, value)
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}
    return LanguoidSchema().dump(languoid).data
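# Sketch of the load/validate step used above, assuming marshmallow 2.x (the
# `data, errors = schema.load(...)` two-value return matches that API; in
# marshmallow 3 load() raises ValidationError instead). The schema and its
# fields are invented stand-ins for the real LanguoidSchema.
from marshmallow import Schema, fields

class LanguoidSketchSchema(Schema):
    name = fields.Str()
    latitude = fields.Float()

data, errors = LanguoidSketchSchema(partial=True).load({'latitude': 'not-a-number'})
print(errors)  # e.g. {'latitude': ['Not a valid number.']} under marshmallow 2.x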
def main(args):
    data = Data()
    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description="Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )
    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(
        common.Contributor, "Kevin Scannell",
        id="Kevin Scannell",
        name="Kevin Scannell",
        email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
def test_CustomModelMixin(self):
    from clld.tests.fixtures import CustomLanguage

    DBSession.add(CustomLanguage(id='abc', name='Name', custom='c'))
    DBSession.flush()
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        self.assertEqual(lang.custom, 'c')
        break
def test_Dataset(self):
    from clld import RESOURCES
    from clld.db.models.common import Dataset, Source

    d = Dataset(id='abc', domain='test')
    DBSession.add(d)
    DBSession.flush()
    d.get_stats(RESOURCES, source=Source.id == None)
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
def test_Data(self):
    from clld.db.models.common import Language, Language_data

    l = Language(id='abc', name='Name')
    l.data.append(Language_data(key='abstract', value='c'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    self.assertEqual(l.datadict()['abstract'], 'c')
def test_JSONEncodedDict(db):
    l = Language(id='abc', name='Name', jsondata={'i': 2})
    DBSession.add(l)
    DBSession.flush()

    DBSession.expunge(l)
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        assert lang.jsondata['i'] == 2
        break
def test_Base_jsondata(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    l.update_jsondata(a=1)
    assert 'a' in l.jsondata
    l.update_jsondata(b=1)
    assert 'b' in l.jsondata and 'a' in l.jsondata
    assert 'b' in l.__json__(None)['jsondata']
def test_Data(db):
    from clld.db.models.common import Language, Language_data

    l = Language(id='abc', name='Name')
    l.data.append(Language_data(key='abstract', value='c'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.datadict()['abstract'] == 'c'
def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()
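# Illustrative sketch only: the same converter-driven pattern as migrate() above,
# but over plain dicts instead of a database, to show the two return conventions
# a converter may use (a plain dict, or a (key, dict) pair). All names here are
# hypothetical and not part of the original code base.
def convert_country(row):
    if not row.get('id'):
        return None  # skipped, like `if not res: continue`
    return row['id'], dict(id=row['id'], name=row['name'])

rows = [{'id': 'de', 'name': 'Germany'}, {'id': '', 'name': 'broken'}]
converted = {res[0]: res[1] for res in map(convert_country, rows) if res}
print(converted)  # {'de': {'id': 'de', 'name': 'Germany'}}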
def test_compute_language_sources(self):
    from clld.db.models.common import Source, Sentence, Language, SentenceReference
    from clld.db.meta import DBSession
    from clld.db.util import compute_language_sources

    s = Sentence(id='sentenced', language=Language(id='newlang'))
    sr = SentenceReference(sentence=s, source=Source.first())
    DBSession.add(sr)
    DBSession.flush()
    compute_language_sources()
def test_Base(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    DBSession.expunge(l)
    l = Language.get('abc', session=DBSession)
    assert l.name == 'Name'
    Language().__str__()
    assert repr(l) == "<Language 'abc'>"
def import_features(cldf, contributors):  # pragma: no cover
    """
    ? = gray cbbbbbb (is ? mapped? if not then don't worry)
    0 = blue c0077bb
    1 = red ccc3311
    2 = teal c009988
    3 = orange cee7733
    """
    features, codes = {}, {}
    icons = [
        'cffffff',  # 'c0077bb'
        'cff0000',  # 'ccc3311'
        'c0000ff',  # 'c009988'
        'cffff00',  # 'cee7733'
    ]
    domains = {}
    for fid, des in itertools.groupby(
            sorted(cldf['CodeTable'], key=lambda c: c['Parameter_ID']),
            lambda c: c['Parameter_ID']):
        domains[fid] = list(des) + [
            dict(ID=fid + '-NA', Name='?', Description='Not known')]

    for feature in tqdm(list(cldf['ParameterTable']), desc='loading features'):
        fid = feature['ID']
        f = Feature(
            id=fid,
            name=feature['Name'],
            description=feature['Description'],
        )
        for ord, patron in enumerate(feature['Patrons'], start=1):
            DBSession.add(
                FeaturePatron(ord=ord, feature=f, contributor_pk=contributors[patron]))
        for code in domains[fid]:
            if code['Name'] == '?':
                icon, number, value = 'tcccccc', 999, None
            else:
                icon, number, value = icons[int(code['Name'])], int(code['Name']), code['Name']
            DomainElement(
                id=code['ID'],
                parameter=f,
                name=code['Name'],
                number=number,
                description=code['Description'],
                jsondata=dict(icon=icon))
        DBSession.add(f)
        DBSession.flush()
        features[fid] = f.pk
        for de in f.domain:
            codes[de.id] = de.pk
    return features, codes
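# Self-contained sketch (illustration only) of why the codes are sorted by
# Parameter_ID before itertools.groupby above: groupby only merges *adjacent*
# items, so unsorted input would split one parameter into several groups.
# The sample records are invented.
import itertools

codes = [
    {'Parameter_ID': 'GB020', 'Name': '1'},
    {'Parameter_ID': 'GB021', 'Name': '0'},
    {'Parameter_ID': 'GB020', 'Name': '0'},
]
by_param = {
    fid: list(des)
    for fid, des in itertools.groupby(
        sorted(codes, key=lambda c: c['Parameter_ID']),
        lambda c: c['Parameter_ID'])
}
print({fid: len(des) for fid, des in by_param.items()})  # {'GB020': 2, 'GB021': 1}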
def _addSource(lp):
    """For a lighter 'main' function."""
    DBSession.add(
        common.Source(
            id=lp[0],
            name=lp[0],
            author=lp[2],
            year=lp[3],
            title=lp[4],
            url=lp[5],
            note=lp[6]))
    DBSession.flush()
def test_Files(self):
    from clld.db.models.common import Language, Language_files

    if PY3:
        return  # pragma: no cover

    l = Language(id='abc', name='Name')
    assert l.iso_code is None
    l._files.append(Language_files(id='abstract'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    f = l.files['abstract']
def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(Path(tmppath), 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
def main(args):
    data = Data()
    data.add(
        common.Dataset,
        "starling",
        id="starling",
        name="Starling",
        domain="starling.rinet.ru",
        description="The Global Lexicostatistical Database",
        publisher_name="Russian State University for the Humanities, Moscow")
    data.add(common.Contribution, "starling", name="Starling", id="starling")

    def row_to_dict(row_entry):
        swadesh_id_idx, swadesh_word_idx, form_idx, cognation_idx, notes_idx = range(0, 5)
        return {
            "swadesh_id": row_entry[swadesh_id_idx].value,
            "swadesh_word": row_entry[swadesh_word_idx].value,
            "form": row_entry[form_idx].value,
            "cognation_index": row_entry[cognation_idx].value,
            "notes": row_entry[notes_idx].value,
        }

    data_dir = "./gld/scripts/data/"
    for path in os.listdir(data_dir):
        data_file_path = os.path.join(data_dir, path)
        book = load_workbook(data_file_path)
        sheet = book.active
        lang_name = sheet["C1"].value
        data.add(
            common.Language,
            lang_name,
            id=lang_name,
            name=lang_name,
            latitude=52.0,
            longitude=0.0)
        fields = ["swadesh_id", "swadesh_word", "form", "cognation_index", "notes"]
        for row in sheet.iter_rows(min_row=2, min_col=1):
            row_data = row_to_dict(row)
            w = data.add(
                Word,
                "%s_%s" % (row_data["swadesh_id"], row_data["form"]),
                name=row_data["form"],
                description="Description",
                jsondata={k: row_data[k] for k in fields})
            w.language = data["Language"][lang_name]
    DBSession.flush()
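# Minimal, self-contained sketch (assumption: openpyxl is available) of the
# worksheet layout the loop above expects: the language name in C1, one word
# per row from row 2 on, columns in the order swadesh_id, swadesh_word, form,
# cognation_index, notes. The in-memory workbook stands in for the real files.
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws["C1"] = "Examplish"  # language name, read via sheet["C1"].value above
ws["A2"], ws["B2"], ws["C2"], ws["D2"], ws["E2"] = 1, "I", "mi", 1, "note"
for row in ws.iter_rows(min_row=2, min_col=1):
    print([cell.value for cell in row])  # -> [1, 'I', 'mi', 1, 'note']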
def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(tmppath, 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
def test_CustomModelMixin_polymorphic(self):
    from clld.tests.fixtures import CustomLanguage

    lang = Language(id='def', name='Name')
    clang = CustomLanguage(id='abc', name='Name', custom='c')
    DBSession.add_all([lang, clang])
    DBSession.flush()
    DBSession.expunge_all()
    lang = DBSession.query(Language).filter_by(id='def').one()
    clang = DBSession.query(Language).filter_by(id='abc').one()

    self.assertEqual(lang.polymorphic_type, 'base')
    self.assertEqual(clang.polymorphic_type, 'custom')
    self.assertIs(type(lang), Language)
    self.assertIs(type(clang), CustomLanguage)
def test_CsvMixin(self):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()
    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            self.assertIn('a', json.loads(v))
    l2 = Language.from_csv(row)
    assert_almost_equal(l1.latitude, l2.latitude)
    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    self.assertLess(l2.latitude, l1.latitude)
def test_CsvMixin(db):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()
    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            assert 'a' in json.loads(v)
    l2 = Language.from_csv(row)
    assert pytest.approx(l1.latitude) == l2.latitude
    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    assert l2.latitude < l1.latitude
def main(args):
    with open(args.data_file('2.3', 'obsolete_refs.json')) as fp:
        obsolete = json.load(fp)

    with transaction.manager:
        provider = Provider.get('glottolog20121')
        for ref in provider.refs:
            if ref.id in obsolete:
                Config.add_replacement(ref, None, session=DBSession, model=Source)
                DBSession.delete(ref)
            else:
                assert len(ref.providers) > 1
        DBSession.flush()
        DBSession.delete(provider)
def add_file(self, type_, checksum, file_cls, obj):
    if checksum in self.cdstar:
        jsondata = {k: v for k, v in self.props.get(type_, {}).items()}
        jsondata.update(self.cdstar[checksum])
        f = file_cls(
            id='%s-%s' % (obj.id, checksum),
            name=self.cdstar[checksum]['original'],
            object_pk=obj.pk,
            mime_type=self.cdstar[checksum]['mimetype'],
            jsondata=jsondata)
        DBSession.add(f)
        DBSession.flush()
        DBSession.refresh(f)
        return
    print('{0} file missing: {1}'.format(type_, checksum))
    return
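# Tiny illustration (invented values) of the jsondata merge order in add_file
# above and in its add_file(..., attrs=...) variant further down: per-type
# defaults come first, the cdstar catalog record overrides them, and explicit
# attrs win over both, because each dict.update applies later keys on top of
# earlier ones.
type_props = {'thumbnail': 'web'}                      # stands in for self.props.get(type_, {})
cdstar_record = {'original': 'x.mp3', 'mimetype': 'audio/mpeg'}
attrs = {'mimetype': 'audio/ogg'}

jsondata = dict(type_props)
jsondata.update(cdstar_record)
jsondata.update(attrs)
print(jsondata['mimetype'])  # 'audio/ogg' - the explicit attrs value wins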
def test_CustomModelMixin_polymorphic(db, custom_language):
    lang = Language(id='def', name='Name')
    assert repr(lang).startswith("<Language ")
    assert is_base(Language)
    assert not is_base(custom_language)

    clang = custom_language(id='abc', name='Name', custom='c')
    DBSession.add_all([lang, clang])
    DBSession.flush()
    DBSession.expunge_all()
    lang = DBSession.query(Language).filter_by(id='def').one()
    clang = DBSession.query(Language).filter_by(id='abc').one()

    assert lang.polymorphic_type == 'base'
    assert clang.polymorphic_type == 'custom'
    assert type(lang) is Language
    assert type(clang) is custom_language
def _addEditor(dataset, count, lp):
    """For a lighter 'main' function."""
    eds = ['Frank Seifart', 'Ludger Paschen', 'Matthew Stave']
    ed = dorEditor(
        id=lp[0],
        name=lp[0],
        url=lp[1],
        email=lp[2],
        address=lp[3],
        team=lp[4],
        function=lp[5])
    if lp[0] in eds:
        common.Editor(dataset=dataset, contributor=ed, ord=count + 1)
        count += 1
    DBSession.add(ed)
    DBSession.flush()
    return dataset, count
def test_Files(self):
    from clld.db.models.common import Sentence, Sentence_files

    if PY3:
        return  # pragma: no cover

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(Path(mkdtemp()).joinpath('clldtest').as_posix(), 'content')
    assert os.path.exists(p)
    rmtree(Path(p).parent.parent)

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
def _addText(lp):
    """For a lighter 'main' function and because of checks."""
    for a in range(1, len(lp)):
        if a == 18:
            if lp[a] == "no":
                lp[a] = False
            else:
                lp[a] = True
        elif a == 17:
            if not lp[a] or str(lp[a]).startswith("check"):
                lp[a] = 0
        elif a == 9:
            genre = lp[9].lower()
            if genre == "personal narrative":
                genre = "pers. narr."
            elif genre == "traditional narrative":
                genre = "trad. narr."
            elif genre == "conversation":
                genre = "convers."
            elif genre == "stimulus-based":
                genre = "stimulus"
            lp[9] = genre
        elif not lp[a]:
            lp[a] = 'na'
    DBSession.add(
        doreContrib(
            id=lp[1],
            tname=lp[2],
            spks=lp[3],
            spks_age=lp[4],
            spks_agec=lp[5],
            spks_sex=lp[6],
            recdate=lp[7],
            recdatec=lp[8],
            genre=lp[9],
            subgenre=lp[10],
            gloss=lp[11],
            transl=lp[12],
            sound=lp[13],
            overlap=lp[14],
            process=lp[15],
            NAK=lp[16],
            glottocode=lp[0],
            words=lp[17],
            extended=lp[18]))
    DBSession.flush()
def put_identifier(request):
    REQ_FIELDS = ['name', 'type']
    OPT_FIELDS = ['description', 'lang']
    is_partial = False
    new_identifier = request.json_body
    id_query, errors = query_identifier(
        request.matchdict['type'], request.matchdict['name'])
    if errors:
        request.response.status = 404
        return {'error': errors}
    identifier = id_query.first()

    if not any(k in new_identifier for k in REQ_FIELDS):
        is_partial = True
    else:
        all_fields = REQ_FIELDS + OPT_FIELDS
        update_fields = (k for k in all_fields if k not in new_identifier)
        for field in update_fields:
            new_identifier[field] = getattr(identifier, field)

    try:
        data, errors = IdentifierSchema(partial=is_partial).load(new_identifier)
    except (ValueError, ValidationError) as e:
        request.response.status = 400
        return {'error': '{}'.format(e)}
    if errors:
        request.response.status = 400
        return {'error': errors}

    try:
        for key in new_identifier:
            # Cannot direct lookup on identifier object
            setattr(identifier, key, getattr(data, key))
        DBSession.flush()
        result = json.dumps(IdentifierSchema().dump(identifier))
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': '{}'.format(e)}

    # Commit if no errors
    transaction.commit()
    return result
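# Plain-dict sketch (illustration only, names invented) of the PUT-vs-partial
# logic above: if none of the required fields is present, the payload is
# treated as a partial update; otherwise any missing optional field is
# backfilled from the stored record so full-record validation sees every field.
REQ_FIELDS = ['name', 'type']
OPT_FIELDS = ['description', 'lang']
stored = {'name': 'deu', 'type': 'iso639-3', 'description': 'German', 'lang': 'en'}

def prepare(payload):
    if not any(k in payload for k in REQ_FIELDS):
        return payload, True                     # partial update
    merged = dict(payload)
    for k in REQ_FIELDS + OPT_FIELDS:
        merged.setdefault(k, stored[k])          # backfill from the stored record
    return merged, False

print(prepare({'description': 'Standard German'}))   # partial
print(prepare({'name': 'deu', 'type': 'iso639-3'}))  # full, backfilled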
def delete_languoid(request):
    glottocode = request.matchdict['glottocode']
    languoid = query_languoid(DBSession, glottocode)
    if languoid is None:
        request.response.status = 404
        return {'error': 'Not a valid languoid ID'}

    try:
        languoid.active = False
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}

    request.response.status = 204
    return LanguoidSchema().dump(languoid).data
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    if 1:
        langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
        features = {f.pk: f for f in DBSession.query(models.Feature)}

        for lpk, nf in DBSession.query(
                common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.language_pk):
            langs[lpk].representation = nf

        for fpk, nl in DBSession.query(
                common.ValueSet.parameter_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.parameter_pk):
            features[fpk].representation = nl

        compute_language_sources()

    get_repos()

    for obj in DBSession.query(LanguageTreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(TreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(Phylogeny).all():
        DBSession.delete(obj)
    DBSession.flush()

    for tree in tqdm(
            iter_trees(
                [l.id for l in DBSession.query(common.Language)],
                Glottolog(REPOS['glottolog']))):
        nodes = set(n.name for n in tree.traverse())
        phylo = Phylogeny(
            id=tree.name.split('_')[1],
            name=tree.name,
            newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(
                language=l, treelabel=TreeLabel(id=l.id, name=l.id, phylogeny=phylo))
        DBSession.add(phylo)
def test_Files(self):
    from clld.db.models.common import Sentence, Sentence_files
    from path import path

    if PY3:
        return  # pragma: no cover

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(path(gettempdir()), 'content')
    assert os.path.exists(p)
    os.remove(p)

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
def add_file(self, type_, checksum, file_cls, obj, attrs=None):
    if checksum in self.cdstar:
        jsondata = {k: v for k, v in self.props.get(type_, {}).items()}
        jsondata.update(self.cdstar[checksum])
        if attrs:
            jsondata.update(attrs)
        f = file_cls(
            id='%s-%s' % (obj.id, checksum),
            name=self.cdstar[checksum]['original'],
            object_pk=obj.pk,
            mime_type=self.cdstar[checksum]['mimetype'],
            jsondata=jsondata)
        DBSession.add(f)
        DBSession.flush()
        DBSession.refresh(f)
        return
    print('{0} file missing: {1}'.format(type_, checksum))
    return
def test_freeze(self):
    from clld.scripts.freeze import freeze_func, unfreeze_func

    tmp = Path(mkdtemp())
    tmp.joinpath('data').mkdir()
    tmp.joinpath('appname').mkdir()

    class Args(object):
        env = self.env
        module_dir = tmp.joinpath('appname').resolve()
        module = Mock(__name__='appname')

        def data_file(self, *comps):
            return tmp.resolve().joinpath('data', *comps)

    DBSession.flush()
    args = Args()
    freeze_func(args, dataset=Dataset.first(), with_history=False)
    self.assert_(tmp.joinpath('data.zip').exists())

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    self.assertEqual(
        engine.execute('select count(*) from language').fetchone()[0], 0)
    unfreeze_func(args, engine=engine)

    s1 = DBSession
    s2 = sessionmaker(bind=engine)()
    self.assertEqual(s1.query(Language).count(), s2.query(Language).count())

    l1 = s1.query(Language).filter(Language.latitude != null()).first()
    l2 = s2.query(Language).filter(Language.pk == l1.pk).first()
    self.assertEqual(l1.created, l2.created)
    self.assertEqual(l1.latitude, l2.latitude)
    self.assertEqual(l1.description, l2.description)

    contrib = s2.query(Contribution).filter(Contribution.id == 'contribution').one()
    self.assert_(contrib.primary_contributors)
    self.assert_(contrib.secondary_contributors)

    rmtree(tmp, ignore_errors=True)
def test_UnitValue(self):
    from clld.db.models.common import UnitParameter, UnitValue, UnitDomainElement

    p1 = UnitParameter()
    p2 = UnitParameter()
    v = UnitValue(unitdomainelement=UnitDomainElement(parameter=p1, name='ude'))
    self.assertEqual(str(v), 'ude')
    DBSession.add(v)
    DBSession.add(p2)
    DBSession.flush()

    try:
        v.unitparameter_pk = p2.pk
        raise ValueError  # pragma: no cover
    except AssertionError:
        pass

    v.unitparameter_pk = p1.pk
    DBSession.flush()
def test_UnitValue(db):
    from clld.db.models.common import (
        Unit, Language, UnitParameter, UnitValue, UnitDomainElement)

    u = Unit(name='unit', language=Language(name='language'))
    p1 = UnitParameter()
    p2 = UnitParameter()
    # NOTE: we assume parameter of UnitValue and UnitDomainElement are identical
    # (i.e. we do not enforce/check this)
    v = UnitValue(
        unit=u,
        unitparameter=p1,
        unitdomainelement=UnitDomainElement(parameter=p1, name='ude'))
    assert str(v) == 'ude'
    DBSession.add(v)
    DBSession.add(p2)
    DBSession.flush()

    try:
        v.unitparameter_pk = p2.pk
        raise ValueError  # pragma: no cover
    except AssertionError:
        pass

    v.unitparameter_pk = p1.pk
    DBSession.flush()
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()

        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
def main(args): data = Data() files_dir.rmtree() files_dir.mkdir() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read('People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 8, 15), # # TODO: switch license! # license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License'}) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict((row['ID'], row['RGB_code']) for row in read('Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 with open(data_dir.joinpath('non-lgr-gloss-abbrs.csv'), 'rb') as csvfile: for row in csv.reader(csvfile): for match in GLOSS_ABBR_PATTERN.finditer(row[1]): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row[0])) non_bibs = {} for row in read('References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add( common.Source, row['Reference_ID'], id=row['Reference_ID'], name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() 
DBSession.flush() gt = {} p = re.compile('[0-9]+\_(?P<name>[^\_]+)\_(GT|Text)') for d in data_dir.joinpath('gt').files(): m = p.search(unicode(d.basename())) if m: for part in m.group('name').split('&'): # make sure we prefer files named "Text_for_soundfile" if slug(unicode(part)) not in gt or 'Text_for_' in d.basename(): gt[slug(unicode(part))] = d gt_audio = {} p = re.compile('(?P<name>[^\.]+)\.mp3') for d in data_dir.joinpath('gt', 'audio').files(): m = p.search(unicode(d.basename())) assert m for part in m.group('name').split('&'): gt_audio[slug(unicode(part))] = d with open(args.data_file('infobox.json')) as fp: infobox = json.load(fp) for row in read('Languages', 'Order_number'): lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], #base_language=row['Category_base_language'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add(common.Language_data( object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] == "Checked": desc = row.get('Languages_contribution_documentation::Lect description', '') else: desc = '' c = data.add( models.ApicsContribution, row['Language_ID'], id=row['Order_number'], name=row['Language_name'], description=desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) if slug(row['Language_name']) in gt: f = common.Contribution_files( object=c, id='%s-gt.pdf' % c.id, name='Glossed text', mime_type='application/pdf') f.create(files_dir, file(gt[slug(row['Language_name'])]).read()) else: print '--- no glossed text for:', row['Language_name'] if slug(row['Language_name']) in gt_audio: f = common.Contribution_files( object=c, id='%s-gt.mp3' % c.id, name='Glossed text audio', mime_type='audio/mpeg') f.create(files_dir, file(gt_audio[slug(row['Language_name'])]).read()) else: print '--- no audio for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add( common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type='iso639-3') DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add( common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][row['Language_name_ethnologue']])) example_count = {} soundfiles = {} for p in data_dir.joinpath('Soundfiles_Examples').files(): soundfiles[p.namebase] = p for row in read('Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max([example_count.get(row['Language_ID'], 1), row['Example_number']]) p = data.add( common.Sentence, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or 
row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={'sort': row['Order_number']}, language=lang) if id_ in soundfiles: #print '---> sound', id_ f = common.Sentence_files( object=p, id='%s.mp3' % p.id, name='Audio', mime_type='audio/mpeg') f.create(files_dir, file(soundfiles[id_]).read()) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add(common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'], )) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read('Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add(common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read('Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add( models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=row['Feature_annotation_publication'], markup_description=normalize_markup(row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append(models.FeatureAuthor( ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict(zip( primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read('Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue 
number_map[row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]] = str(feature_count) p = data.add( models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if truth(row['Vowel']) else ( 'Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add( common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read('Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add( models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 7): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row['Value%s' % i] and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: name = '%s - %s' % (row['Sociolinguistic_feature_name'], i) kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) de = data.add( common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={'color': colors.values()[i]}) sd = {} soundfiles = {} for p in data_dir.joinpath('Soundfiles_Segments').files(): soundfiles[p.namebase] = p for row in read('Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] #Language_ID,Segment_feature_number,Comments,Audio_file_name,Example_word, #Example_word_gloss,Presence_in_the_language,Refers_to_references_Reference_ID if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement']['%s-%s' % ( number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = data.add( common.Sentence, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) sid = '%(Language_ID)s-%(Segment_feature_number)s' % row if sid in soundfiles: print '---> sound', sid f = common.Sentence_files( object=p, id='%s.mp3' % p.id, name='Audio', mime_type='audio/mpeg') f.create(files_dir, file(soundfiles[sid]).read()) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = 
data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add(common.ValueSetReference( valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read('wals'): if row['z_calc_WALS_value_number']: wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr, num_values in [ ('', '', 10), ('Sociolinguistic', 'sl', 7), ]: for row in read(prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print 'no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)] continue lid = row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] #if row[prefix('feature_code', _prefix)] not in data['Feature']: # print row[prefix('feature_code', _prefix)] # print str(row[prefix('data_record_id', _prefix)]) # raise ValueError language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup(row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), #valueset=valueset, domainelement=data['DomainElement']['%s-%s' % ( row[prefix('feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = {'wals_value_number': wals_value_number.pop(row[prefix('data_record_id', _prefix)])} valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for 
corresponding primary features! # if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int(parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add(common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add(common.ValueSetReference( valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add(common.ValueSetReference( valueset=vs, source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: print('Reference for unknown dataset: %s' % row[prefix + 'ata_record_id']) continue DBSession.flush() missing = 0 for row in read('Value_examples'): try: DBSession.add(common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read('Contributors')): kw = dict( contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']] ) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
def main(args): # pragma: no cover glottocodes = {} for row in GC.execute("select ll.hid, l.id from language as l, languoid as ll where ll.pk = l.pk"): if row[0] and len(row[0]) == 3: glottocodes[row[0]] = row[1] icons = issues.Icons() old_db = DB vs2008 = get_vs2008(args) missing_sources = [] refdb_ids = {} max_id = 7350 with open("/home/robert/venvs/clld/data/wals-data/missing_source.py", "w") as fp: for row in old_db.execute("select * from reference"): try: author, year = row["id"].split("-") except: author, year = None, None bibdata = get_source(row["id"]) if not bibdata: fp.write('"%s",\n' % row["id"]) missing_sources.append(row["id"]) bibdata["pk"] = max_id max_id += 1 if bibdata["pk"] in refdb_ids: print("already seen:", row["id"], "as", refdb_ids[bibdata["pk"]]) data["Source"][row["id"]] = data["Source"][refdb_ids[bibdata["pk"]]] continue refdb_ids[bibdata["pk"]] = row["id"] bibdata.update( { "id": row["id"], "name": row["name"], "description": bibdata.get("title", bibdata.get("booktitle")), "google_book_search_id": row["gbs_id"] or None, } ) data.add(common.Source, row["id"], **bibdata) # # TODO: add additional bibdata as data items # print("sources missing for %s refs" % len(missing_sources)) for id, name in ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id, name=name)) migrate("country", models.Country, lambda r: (r["id"], dict(id=r["id"], name=r["name"], continent=r["continent"]))) migrate("family", models.Family, lambda r: (r["id"], dict(id=r["id"], name=r["name"], description=r["comment"]))) for row, icon in zip(list(old_db.execute("select * from genus order by family_id")), cycle(iter(icons))): genus = data.add(models.Genus, row["id"], id=row["id"], name=row["name"], icon=icon, subfamily=row["subfamily"]) genus.family = data["Family"][row["family_id"]] DBSession.flush() migrate( "altname", common.Identifier, lambda r: ((r["name"], r["type"]), dict(name=r["name"], type="name", description=r["type"])), ) # names for isolanguages are not unique! 
enames = {} for r in DB.execute("select * from isolanguage"): id_ = "ethnologue-%s" % r["id"] if r["name"] in enames: data["Identifier"][id_] = enames[r["name"]] else: enames[r["name"]] = data.add( common.Identifier, id_, id=id_, name=r["name"], type="name", description="ethnologue" ) DBSession.flush() migrate( "isolanguage", common.Identifier, lambda r: ( r["id"], dict(id=r["id"], name=r["id"], type=common.IdentifierType.iso.value, description=r["name"]), ), ) migrate( "isolanguage", common.Identifier, lambda r: None if r["id"] not in glottocodes else ( "gc-%s" % r["id"], dict( id="gc-%s" % r["id"], name=glottocodes[r["id"]], type=common.IdentifierType.glottolog.value, description=r["name"], ), ), ) migrate( "language", models.WalsLanguage, lambda r: ( r["id"], dict( id=r["id"], name=r["name"], latitude=r["latitude"], longitude=r["longitude"], ascii_name=r["ascii_name"], genus=data["Genus"][r["genus_id"]], samples_100=r["samples_100"] != 0, samples_200=r["samples_200"] != 0, ), ), ) migrate( "author", common.Contributor, lambda r: (r["id"], dict(name=r["name"], url=r["www"], id=r["id"], description=r["note"])), ) dataset = common.Dataset( id="wals", name="WALS Online", description="The World Atlas of Language Structures Online", domain="wals.info", published=date(2013, 8, 15), contact="*****@*****.**", license="http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en", jsondata={ "license_icon": "http://wals.info/static/images/cc_by_nc_nd.png", "license_name": "Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany", }, ) DBSession.add(dataset) for i, editor in enumerate(["dryerms", "haspelmathm"]): common.Editor(dataset=dataset, contributor=data["Contributor"][editor], ord=i + 1) migrate( "country_language", models.CountryLanguage, lambda r: dict( language_pk=data["WalsLanguage"][r["language_id"]].pk, country_pk=data["Country"][r["country_id"]].pk ), ) migrate( "altname_language", common.LanguageIdentifier, lambda r: dict( language=data["WalsLanguage"][r["language_id"]], identifier=data["Identifier"][(r["altname_name"], r["altname_type"])], description=r["relation"], ), ) migrate( "isolanguage_language", common.LanguageIdentifier, lambda r: dict( language=data["WalsLanguage"][r["language_id"]], identifier=data["Identifier"][r["isolanguage_id"]], description=r["relation"], ), ) migrate( "isolanguage_language", common.LanguageIdentifier, lambda r: None if "ethnologue-%s" % r["isolanguage_id"] not in data["Identifier"] else dict( language=data["WalsLanguage"][r["language_id"]], identifier=data["Identifier"]["ethnologue-%s" % r["isolanguage_id"]], description=r["relation"], ), ) migrate( "isolanguage_language", common.LanguageIdentifier, lambda r: None if "gc-%s" % r["isolanguage_id"] not in data["Identifier"] else dict( language=data["WalsLanguage"][r["language_id"]], identifier=data["Identifier"]["gc-%s" % r["isolanguage_id"]], description=r["relation"], ), ) migrate( "area", models.Area, lambda r: (r["id"], dict(name=r["name"], dbpedia_url=r["dbpedia_url"], id=str(r["id"]))) ) def migrate_chapter(row): kw = dict( id=row["id"], name=row["name"], wp_slug=row["blog_title"], sortkey=int(row["id"]), area=data["Area"][row["area_id"]], ) if int(row["id"]) in [143, 144]: kw["created"] = E2011 kw["updated"] = E2011 return row["id"], kw migrate("chapter", models.Chapter, migrate_chapter) def migrate_supplement(row): if row["name"] not in ["Help", "Abbreviations"]: sortkey = 990 + int(row["id"]) if row["name"] != "Introduction" else 0 id_ = "s%s" % row["id"] kw = dict(id=id_, 
name=row["name"], sortkey=sortkey) return id_, kw migrate("supplement", models.Chapter, migrate_supplement) migrate( "chapter_reference", common.ContributionReference, lambda r: dict(contribution=data["Chapter"][r["chapter_id"]], source=data["Source"][r["reference_id"]]), ) migrate( "reference_supplement", common.ContributionReference, lambda r: dict( contribution=data["Chapter"]["s%s" % r["supplement_id"]], source=data["Source"][r["reference_id"]] ), ) def migrate_feature(row): kw = dict(id=row["id"], name=row["name"], ordinal_qualifier=row["id"][-1]) if row["id"].startswith("143") or row["id"].startswith("144"): kw["created"] = E2011 kw["updated"] = E2011 kw["chapter"] = data["Chapter"][row["chapter_id"]] return row["id"], kw migrate("feature", models.Feature, migrate_feature) def migrate_value(row): desc = row["description"] if desc == "SOV & NegV/VNeg": if row["icon_id"] != "s9ff": desc += " (a)" else: desc += " (b)" kw = dict( id="%s-%s" % (row["feature_id"], row["numeric"]), name=desc, description=row["long_description"], jsondata=dict(icon=issues.Icons.id(row["icon_id"])), number=row["numeric"], parameter=data["Feature"][row["feature_id"]], ) return (row["feature_id"], row["numeric"]), kw migrate("value", common.DomainElement, migrate_value) same = 0 added = 0 for row in old_db.execute("select * from datapoint"): parameter = data["Feature"][row["feature_id"]] language = data["WalsLanguage"][row["language_id"]] id_ = "%s-%s" % (parameter.id, language.id) created = E2008 updated = E2008 value_numeric = row["value_numeric"] if (language.id, parameter.id) in vs2008: if vs2008[(language.id, parameter.id)] != row["value_numeric"]: print("~~~", id_, vs2008[(language.id, parameter.id)], "-->", row["value_numeric"]) value_numeric = vs2008[(language.id, parameter.id)] else: same += 1 else: updated = E2011 created = E2011 if parameter.id[-1] == "A" and not (parameter.id.startswith("143") or parameter.id.startswith("144")): added += 1 kw = dict(id=id_, updated=updated, created=created) valueset = data.add( common.ValueSet, row["id"], language=language, parameter=parameter, contribution=parameter.chapter, **kw ) data.add( common.Value, id_, domainelement=data["DomainElement"][(row["feature_id"], value_numeric)], valueset=valueset, **kw ) print(same, "datapoints did not change") print(added, "datapoints added to existing features") DBSession.flush() migrate( "datapoint_reference", common.ValueSetReference, lambda r: dict( valueset=data["ValueSet"][r["datapoint_id"]], source=data["Source"][r["reference_id"]], description=r["note"], ), ) migrate( "author_chapter", common.ContributionContributor, lambda r: dict( ord=r["order"], primary=r["primary"] != 0, contributor_pk=data["Contributor"][r["author_id"]].pk, contribution_pk=data["Chapter"][r["chapter_id"]].pk, ), ) migrate( "author_supplement", common.ContributionContributor, lambda r: dict( ord=r["order"], primary=r["primary"] != 0, contributor_pk=data["Contributor"][r["author_id"]].pk, contribution_pk=data["Chapter"]["s%s" % r["supplement_id"]].pk, ), ) igts = defaultdict(lambda: []) for row in old_db.execute("select * from igt"): d = {"id": "igt-%s" % row["id"]} d.update(parse_igt(row["xhtml"])) igts[row["example_id"]].append(d) for row in old_db.execute("select * from example"): if not row["language_id"]: print("example without language:", row["id"]) continue _igts = igts[row["id"]] if _igts: for igt in _igts: data.add( common.Sentence, igt["id"], markup_comment=row["xhtml"], language=data["WalsLanguage"][row["language_id"]], **igt ) else: name 
= teaser(row["xhtml"]) if name: data.add( common.Sentence, row["id"], id=str(row["id"]), name=name, xhtml=row["xhtml"], language=data["WalsLanguage"][row["language_id"]], ) missing = {} for row in old_db.execute("select * from example_feature"): _igts = igts[row["example_id"]] if _igts: for igt in _igts: try: sentence = data["Sentence"][igt["id"]] except KeyError: print("missing sentence:", row["example_id"]) continue try: value = data["Value"]["%s-%s" % (row["feature_id"], sentence.language.id)] DBSession.add(common.ValueSentence(sentence=sentence, value=value)) except KeyError: missing[(row["feature_id"], sentence.language.id)] = 1 else: try: sentence = data["Sentence"][row["example_id"]] except KeyError: print("missing sentence:", row["example_id"]) continue try: value = data["Value"]["%s-%s" % (row["feature_id"], sentence.language.id)] DBSession.add(common.ValueSentence(sentence=sentence, value=value)) except KeyError: missing[(row["feature_id"], sentence.language.id)] = 1 print(len(missing), "missing datapoints for example_feature relations")
def main(args):
    citations.main(args)
    data = Data()

    pairs = {}
    languages = {}

    # language coordinates digitized from the map data
    coords = {}
    for lang in dsv.rows(
            args.data_file('MB_Map_Data_Aug13WLabels'),
            namedtuples=True,
            newline='\n',
            encoding='latin1'):
        coords[slug(lang.Label.split('<')[0].strip())] = (
            float(lang.y), float(lang.x))

    xls = xlrd.open_workbook(args.data_file('MB_BoCatSum_AFBO.xlsx'))
    matrix = xls.sheet_by_name('MB_BoCatSum_AFBO.txt')
    md = "area\trecipient language iso\trecipient language genus\tdonor language iso\tdonor language genus".split('\t')

    # columns H..AX hold the affix-function parameters; the remaining columns are metadata
    fields = []
    params = []
    for i in range(matrix.ncols):
        colname = xlrd.colname(i)
        if len(colname) == 2 and colname > 'BE':
            break
        colval = matrix.cell(0, i).value.strip()
        if (len(colname) == 1 and colname > 'G') or (len(colname) == 2 and colname < 'AY'):
            params.append(colval)
            fields.append(colval)
        else:
            fields.append(colval.lower())

    for f in fields:
        if fields.count(f) > 1:
            print(f)
    assert len(fields) == len(set(fields))

    for j in range(1, matrix.nrows):
        values = dict(zip(fields, [matrix.cell(j, i).value for i in range(matrix.ncols)]))
        try:
            id_ = int(values['perm.id'])
        except:  # not every row carries a numeric perm.id
            continue
        pairs[id_] = values
        for type_ in ['recipient', 'donor']:
            languages[values[type_ + ' language'].strip()] = {
                'macroarea': values['area']}
            for md in ['iso', 'genus']:
                languages[values[type_ + ' language'].strip()][md] \
                    = values['%s language %s' % (type_, md)]

    for name in COORDS:
        assert name in languages

    sources = {}
    with open(args.data_file('MB_Case_List_with_links.html')) as fp:
        worddoc = fp.read()
        for m in re.finditer('"__(?P<recid>[^_]+)__"', worddoc):
            sources[m.group('recid').decode('utf8')] = 1
        soup = bs(worddoc)

    doc = {}
    cols = []
    table = soup.find('table')
    for tr in table.children:
        if tr.name != 'tr':
            continue
        tds = filter(lambda n: n.name == 'td', tr.children)
        if not cols:
            cols = map(text, tds)
        else:
            values = dict(zip(cols, tds))
            try:
                id_ = int(text(values['perm.id']))
                doc[id_] = values
                if id_ in pairs:
                    # sanity check: HTML case list and spreadsheet agree on the language pair
                    assert text(values['Recipient lg.']) == pairs[id_]['recipient language']
                    assert text(values['Don']) == pairs[id_]['donor language']
            except:
                continue

    dataset = common.Dataset(
        id='afbo',
        name="AfBo: A world-wide survey of affix borrowing",
        contact="*****@*****.**",
        domain="afbo.info",
        license='http://creativecommons.org/licenses/by/3.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    for i, spec in enumerate([('seifart', "Frank Seifart")]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    contrib = data.add(common.Contribution, 'afbo', name="AfBo", id="afbo")

    # corrections mapping (iso, AfBo name) to a better iso code and/or glottocode
    iso_map = {
        ('ron', 'Meglenite Romanian'): ('ruq', None),
        ('fra', 'Norman French'): ('xno', None),
        ('tur', 'Turkic'): (None, 'turk1311'),
        ('xuu', 'Kxoe languages'): (None, 'khoe1241'),
        ('zoc', 'Zoquean languages'): (None, 'zoqu1261'),
        ('tzm', 'Moroccan Berber languages'): (None, 'atla1275'),
        ('cvn', 'Quechua'): ('qvn', None),
        ('rop', 'Gurindji Kriol'): (None, 'guri1249'),
        ('ita', 'Sicilian Italian'): ('scn', None),
        ('srp', 'Croatian'): ('hrv', None),
        ('eme', 'Wayampi‑Emerillon‑Zo’é'): (None, 'waya1271'),
        ('ale', 'Copper Island Aleut'): ('mud', None),
        ('car', 'intermediate Proto‑Carib'): (None, 'cari1283'),
        ('ell', 'Cappadocian Greek'): ('cpg', None),
        ('eng', 'Middle English'): ('enm', None),
        ('als', 'Arvanitic Albanian'): ('aat', None),
        ('nys', 'Northern Nyungic'): (None, 'dese1234'),
        ('ron', 'Istro‑Romanian'): ('ruo', None),
        ('chf', 'Cho’ol'): ('ctu', None),
        ('tuo', 'Eastern Tucanoan languages'): (None, 'east2698'),
        ('ceb', 'Visayan'): (None, 'bisa1268'),
        ('por', 'Sri Lanka Portuguese'): (None, 'mala1544'),
        ('brx', 'Tibeto-Burman languages'): (None, 'brah1260'),
    }

    with open('name_conflicts.tab', 'w') as fp:
        fp.write('iso\tafbo\tglottolog\tproposed iso\n')
        for i, name in enumerate(languages.keys()):
            md = languages[name]
            iso = md.pop('iso')
            if iso == 'cvn' and name == 'Quechua':
                iso = 'qvn'
            kw = dict(name=name, id=str(i + 1), jsondata=md)
            if name in COORDS:
                kw['latitude'], kw['longitude'] = COORDS[name]
            elif slug(name) in coords:
                kw['latitude'], kw['longitude'] = coords[slug(name)]
            elif glottocoords.get(iso):
                kw['latitude'], kw['longitude'] = glottocoords[iso]

            if glottonames.get(iso) and slug(glottonames.get(iso)) != slug(name):
                fp.write(('%s\t%s\t%s\t%s\n' % (
                    iso, name, glottonames.get(iso),
                    rglottonames.get(slug(name), ''))).encode('utf8'))

            if name == 'Meglenite Romanian':
                kw['name'] = 'Megleno Romanian'
            if 'latitude' not in kw:
                print(name)
            l = data.add(common.Language, name, **kw)

            iso, gc = iso_map.get((iso, name), (iso, None))
            for code, type_ in [
                    (iso, common.IdentifierType.iso),
                    (gc or glottocodes.get(iso), common.IdentifierType.glottolog)]:
                if code:
                    identifier = data.add(
                        common.Identifier, code, id=code, name=code, type=type_.value)
                    data.add(
                        common.LanguageIdentifier, '%s-%s' % (code, l.id),
                        identifier=identifier, language=l)

    include = list(sources.keys()) + [
        'myersscottoncontact2002',
        'myersscottonlanguage2007',
        'meakinsborrowing2011',
        'seifartprinciple2012',
    ]
    refdb = bibtex.Database.from_file(args.data_file('FSeifartZoteroLibrary14Nov2013.bib'))
    for rec in refdb:
        if slug(rec.id) in include:
            data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for i, name in enumerate(params):
        data.add(models.AffixFunction, name, id=str(i + 1), name=name)

    for id_, vd in pairs.items():
        assert id_ in doc

        donor = data['Language'][vd['donor language'].strip()]
        recipient = data['Language'][vd['recipient language'].strip()]

        p = data.add(
            models.Pair,
            id_,
            id=str(id_),
            name=vd['pairs'].replace('Meglenite', 'Megleno'),
            area=recipient.jsondata['macroarea'],
            description=unicode(doc[id_]['comment']).replace('<h1', '<p').replace('</h1>', '</p>').replace('Meglenite', 'Megleno'),
            reliability=vd['reliability'],
            int_reliability=['high', 'mid', 'low'].index(vd['reliability']),
            count_interrel=int(vd[u'number of interrelated affixes']),
            count_borrowed=int(vd['number of borrowed affixes']),
            donor=donor,
            recipient=recipient)
        DBSession.flush()

        for i, param in enumerate(params):
            param_id = i + 1
            value = vd[param]
            if value != '':
                vsid = '%s-%s' % (recipient.id, param_id)
                if vsid in data['ValueSet']:
                    vs = data['ValueSet'][vsid]
                else:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id=vsid,
                        parameter=data['AffixFunction'][param],
                        language=recipient,
                        contribution=contrib)
                data.add(
                    models.waabValue,
                    '%s-%s' % (id_, param_id),
                    id='%s-%s' % (id_, param_id),
                    pair=p,
                    name='%s' % int(value),
                    numeric=int(value),
                    description='%s' % p,
                    valueset=vs)
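The coordinate and Glottolog-name lookups above match names via slug(). A minimal sketch of what such a normalization typically does (assumption: like clldutils' slug, it keeps only lowercase ASCII letters and digits, so case, spacing and punctuation differences don't break the match):

    import re
    import unicodedata

    def slug(s):
        # Strip diacritics, then keep only lowercase ASCII letters and digits.
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
        return re.sub(r'[^a-z0-9]', '', s.lower())

With this, slug('Megleno Romanian') and slug('megleno-romanian') both yield 'meglenoromanian'.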
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    compute_language_sources()
    return  # NOTE: the statistics computations below are currently disabled by this early return.

    from time import time
    _s = time()

    def checkpoint(s, msg=None):
        n = time()
        print(n - s, msg or '')
        return n

    sql = """
    select p.id, l.id, v.name
    from value as v, valueset as vs, language as l, parameter as p
    where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
    """
    datatriples = [(v[0], v[1], v[2]) for v in DBSession.execute(sql)]
    _s = checkpoint(_s, '%s values loaded' % len(datatriples))

    flv = dict([(feature, dict(lvs)) for (feature, lvs) in grp(datatriples).items()])
    _s = checkpoint(_s, 'triples grouped')

    clfps = list(get_clf_paths(
        [row[0] for row in DBSession.execute("select id from language")]))
    _s = checkpoint(_s, '%s clfps loaded' % len(clfps))

    features = {f.id: f for f in DBSession.query(Feature)}
    for (f, lv) in flv.items():
        features[f].representation = len(lv)
    DBSession.flush()
    _s = checkpoint(_s, 'representation assigned')

    families = {f.id: f for f in DBSession.query(Family)}

    if False:
        fs = feature_stability(datatriples, clfps)
        _s = checkpoint(_s, 'feature_stability computed')

        for (f, (s, transitions, stationarity_p, synchronic_p)) in fs:
            print(f)
            stability = Stability(
                id=f.replace("GB", "S"),
                feature=features[f],
                parsimony_stability_value=s["stability"],
                parsimony_retentions=s["retentions"],
                parsimony_transitions=s["transitions"],
                jsondata={'diachronic_p': stationarity_p, "synchronic_p": synchronic_p})
            DBSession.add(stability)
            for (i, (fam, (fromnode, tonode), (ft, tt))) in enumerate(transitions):
                DBSession.add(Transition(
                    id="%s: %s->%s" % (f, fromnode, tonode),
                    stability=stability,
                    fromnode=get_name(fromnode),
                    tonode=get_name(tonode),
                    fromvalue=ft,
                    tovalue=tt,
                    family=families[fam],
                    retention_innovation="Retention" if ft == tt else "Innovation"))
        DBSession.flush()
        _s = checkpoint(_s, 'stability and transitions loaded')

    imps = feature_dependencies(datatriples)
    _s = checkpoint(_s, 'feature_dependencies computed')

    if True:
        (H, V) = dependencies_graph([(v, f1, f2) for ((v, dstats), f1, f2) in imps])
        _s = checkpoint(_s, 'dependencies_graph written')

        for (i, ((v, dstats), f1, f2)) in enumerate(imps):
            combinatory_status = (
                "primary" if (f1, f2) in H
                else ("epiphenomenal" if v > 0.0 else None)) if H else "N/A"
            DBSession.add(Dependency(
                id="%s->%s" % (f1, f2),
                strength=v,
                feature1=features[f1],
                feature2=features[f2],
                representation=dstats["representation"],
                combinatory_status=combinatory_status,
                jsondata=dstats))
        DBSession.flush()
        _s = checkpoint(_s, 'dependencies loaded')

    coordinates = {
        lg.id: (lg.longitude, lg.latitude)
        for lg in DBSession.query(common.Language)
        .filter(common.Language.longitude != None)
        .filter(common.Language.latitude != None)}
    deepfams = deep_families(datatriples, clfps, coordinates=coordinates)
    _s = checkpoint(_s, '%s deep_families computed' % len(deepfams))

    missing_families = set()
    data = Data()
    for ((l1, l2), support_value, significance, supports, f1c, f2c) in deepfams:
        dname = "proto-%s x proto-%s" % (glottolog_names[l1], glottolog_names[l2])
        kmdistance = havdist(f1c, f2c)
        (f1lon, f1lat) = f1c if f1c else (None, None)
        (f2lon, f2lat) = f2c if f2c else (None, None)

        for li in [l1, l2]:
            if li not in families:
                missing_families.add(li)

        deepfam = DeepFamily(
            id=dname,
            support_value=support_value,
            significance=significance,
            family1=families.get(l1),
            family2=families.get(l2),
            family1_latitude=f1lat,
            family1_longitude=f1lon,
            family2_latitude=f2lat,
            family2_longitude=f2lon,
            geographic_plausibility=kmdistance)
        DBSession.add(deepfam)

        for (f, v1, v2, historical_score, independent_score, support_score) in supports:
            vid = ("%s: %s %s %s" % (f, v1, "==" if v1 == v2 else "!=", v2)).replace(".", "")
            # vname = ("%s|%s" % (v1, v2)).replace(".", "")
            # print(vid, vname)
            if vid not in data["Support"]:
                data.add(
                    Support, vid,
                    id=vid,
                    historical_score=historical_score,
                    independent_score=independent_score,
                    support_score=support_score,
                    value1=v1,
                    value2=v2,
                    feature=features[f])
            DBSession.add(HasSupport(
                id=dname + "-" + vid,
                deepfamily=deepfam,
                support=data["Support"][vid]))

    print('missing_families:')
    print(missing_families)
    DBSession.flush()
    _s = checkpoint(_s, 'deep_families loaded')

    compute_language_sources()
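The geographic_plausibility stored on a DeepFamily above is whatever distance havdist computes between the two family coordinates. A minimal sketch of such a helper, assuming havdist is a haversine-style great-circle distance in kilometres over (longitude, latitude) pairs that returns None when either coordinate is missing:

    from math import asin, cos, radians, sin, sqrt

    def havdist(p1, p2, radius_km=6371.0):
        # Great-circle distance between two (longitude, latitude) points.
        if p1 is None or p2 is None:
            return None
        (lon1, lat1), (lon2, lat2) = p1, p2
        lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
        dlon, dlat = lon2 - lon1, lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        return 2 * radius_km * asin(sqrt(a))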
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gc_names = {
            i.name: i for i in DBSession.query(Identifier)
            .filter(Identifier.type == 'name')
            .filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))

        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    # update an existing languoid in place
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    # create a new languoid together with its identifiers
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name), l,
                        name=l.name, description='Glottolog', type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
def populate_test_db(engine):
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(
        cm.Dataset,
        domain='clld',
        jsondata={'license_icon': 'cc-by', 'license_url': 'http://example.org'})

    data.add_default(cm.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {'b': 'b Name', 'c': 'c Name', 'd': 'd Name'}.items():
        data.add(cm.Contributor, id_, id=id_, name=name, url='http://example.org')

    DBSession.add(
        cm.Editor(dataset=data[cm.Dataset], contributor=data[cm.Contributor]))

    data.add_default(cm.Source)
    data.add(
        cm.Source, 'replaced',
        id='replaced', active=False, jsondata={'__replacement_id__': 'source'})

    data.add_default(cm.Contribution)
    cm.ContributionReference(contribution=data[cm.Contribution], source=data[cm.Source])

    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'), (False, 'd')]:
        cm.ContributionContributor(
            contribution=data[cm.Contribution],
            primary=primary,
            contributor=data['Contributor'][c])

    data.add_default(cm.Language, latitude=10.5, longitude=0.3)
    data[cm.Language].sources.append(data[cm.Source])

    for i, type_ in enumerate(cm.IdentifierType):
        cm.LanguageIdentifier(
            language=data[cm.Language],
            identifier=cm.Identifier(type=type_.value, id=type_.value + str(i), name='a'))
    cm.LanguageIdentifier(
        language=data[cm.Language],
        identifier=cm.Identifier(type='name', id='name', name='a'))

    for i in range(2, 102):
        _l = cm.Language(id='l%s' % i, name='Language %s' % i)
        _i = cm.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        cm.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = data.add_default(cm.Parameter)
    de = cm.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = cm.DomainElement(id='de2', name='DomainElement2', parameter=param)

    valueset = data.add_default(
        cm.ValueSet,
        language=data[cm.Language],
        parameter=param,
        contribution=data[cm.Contribution])
    cm.ValueSetReference(valueset=valueset, source=data[cm.Source], description='10-20')

    data.add_default(
        cm.Value, domainelement=de, valueset=valueset, frequency=50, confidence='high')
    data.add(
        cm.Value, 'value2',
        id='value2', domainelement=de2, valueset=valueset, frequency=50, confidence='high')

    paramnd = data.add(
        cm.Parameter, 'no-domain', id='no-domain', name='Parameter without domain')
    valueset = cm.ValueSet(
        id='vs2',
        language=data[cm.Language],
        parameter=paramnd,
        contribution=data[cm.Contribution])
    cm.ValueSetReference(valueset=valueset, source=data[cm.Source], description='10-20')
    cm.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    unit = data.add_default(cm.Unit, language=data[cm.Language])
    up = data.add_default(cm.UnitParameter)
    cm.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up)

    up2 = cm.UnitParameter(id='up2', name='UnitParameter with domain')
    de = cm.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(cm.UnitValue(
        id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))

    DBSession.add(cm.Source(id='s'))

    sentence = data.add_default(
        cm.Sentence,
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=data[cm.Language],
        jsondata={'alt_translation': 'Spanish: ...'})
    cm.SentenceReference(sentence=sentence, source=data[cm.Source])

    DBSession.add(cm.Config(key='key', value='value'))
    cm.Config.add_replacement('replaced', 'language', model=cm.Language)
    cm.Config.add_replacement('gone', None, model=cm.Language)
    DBSession.flush()
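A minimal sketch of how a test fixture might wire populate_test_db to a throwaway database (assumptions: an in-memory SQLite engine is sufficient for the test suite, and Base/DBSession are clld's declarative base and scoped session from clld.db.meta):

    from sqlalchemy import create_engine

    from clld.db.meta import Base, DBSession

    def fresh_test_db():
        # Bind the scoped session to a disposable in-memory database,
        # create the schema, then load the fixture data defined above.
        engine = create_engine('sqlite://')
        DBSession.configure(bind=engine)
        Base.metadata.create_all(engine)
        populate_test_db(engine)
        return engine

Using an in-memory engine keeps each test run isolated and avoids cleanup between runs.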
def setUp(self):
    TestWithDb.setUp(self)

    DBSession.add(common.Dataset(
        id='dataset', name='dataset', description='desc', domain='clld',
        jsondata={'license_icon': 'cc-by'}))

    DBSession.add(common.Source(
        id='replaced', active=False, jsondata={'__replacement_id__': 'source'}))
    source = common.Source(id='source')

    contributors = {'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name'}
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name, url='http://example.org')

    contribution = common.Contribution(id='contribution', name='Contribution')
    common.ContributionReference(contribution=contribution, source=source)

    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['b'])
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['c'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['d'])

    DBSession.add(contribution)

    language = common.Language(
        id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)

    for i, type_ in enumerate(common.IdentifierType):
        id_ = common.Identifier(type=type_.value, id=type_.value + str(i), name='abc')
        common.LanguageIdentifier(language=language, identifier=id_)

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)
    value2 = common.Value(
        id='value2', domainelement=de2, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value2)

    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(common.UnitValue(
        id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(common.UnitValue(
        id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence', name='sentence name', description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own', comment='comment', original_script='a morpheme',
        language=language, jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=source)

    DBSession.add(common.Config(key='key', value='value'))
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()