def test_Sources(tmpdir):
    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))

    for entry in src:
        assert entry.genre == 'book'
        break

    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3

    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'

    with pytest.raises(ValueError):
        src.add(5)
    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n author="a.b"\n}')
    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover
    with pytest.raises(ValueError):
        src.parse('a[x')
    with pytest.raises(ValueError):
        src.parse('[x]')
    with pytest.raises(ValueError):
        src.validate(['x'])

    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1
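For context, Sources.parse is the primitive behind the expand_refs/format_refs round trip exercised above: it splits a reference string of the form "key[pages]" into a source ID and a page context. A minimal sketch of the expected behaviour (the exact return values are an assumption based on the pycldf API, not taken from this test):

from pycldf.sources import Sources

# 'key[pages]' is expected to split into the source ID and the bracketed context:
assert Sources.parse('huber2005[1-6]') == ('huber2005', '1-6')
# a bare key is assumed to carry no page context:
assert Sources.parse('Obrazy') == ('Obrazy', None)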
def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.
    """
    with self.cldf_writer(args) as writer:
        writer.cldf.add_component('CognatesetTable')
        writer.add_sources(*self.raw_dir.read_bib('cariban_resolved.bib'))

        cmap = writer.add_concepts(lookup_factory=lambda c: c.english)
        cmap['you'] = cmap['thou']
        cmap['grease/fat'] = cmap['grease']
        cmap['breast'] = cmap['breasts']
        cmap['son'] = cmap['person']

        data = pycldf.Dataset.from_metadata(self.raw_dir / 'cariban_data.json')
        for lang in data['LanguageTable']:
            writer.add_language(ID=lang['ID'], Name=lang['Name'], Glottocode=lang['Glottocode'])

        cs_seen = set()
        reconstructions = {
            tuple(c['ID'].split('-')): c['Form']
            for c in self.raw_dir.read_csv('cariban_lexical_reconstructions.csv', dicts=True)}

        for lex in self.raw_dir.read_csv('cariban_swadesh_list.csv', dicts=True):
            # "Language_ID","Swadesh_Nr","Feature_ID","Value","Cognateset_ID","Source","Comment","Full_Form"
            if lex['Feature_ID'] not in cmap:
                print(lex['Feature_ID'])
                continue
            for form in writer.add_lexemes(
                Value=lex['Value'],
                Parameter_ID=cmap[lex['Feature_ID']],
                Language_ID=lex['Language_ID'],
                Source=[
                    Reference(*d) for d in [Sources.parse(lex['Source'].replace(';', ','))]
                ] if lex['Source'] and not lex['Source'].startswith('pc') else [],
            ):
                cs_key = (lex['Feature_ID'], lex['Cognateset_ID'])
                cs_id = '{}-{}'.format(cmap[cs_key[0]], cs_key[1])
                if cs_key not in cs_seen:
                    writer.objects['CognatesetTable'].append(dict(
                        ID=cs_id,
                        Description=reconstructions.get(cs_key),
                    ))
                    cs_seen.add(cs_key)
                writer.add_cognate(lexeme=form, Cognateset_ID=cs_id)

        # Note: We want to re-use LanguageTable across the two CLDF datasets:
        LanguageTable = writer.cldf['LanguageTable']

    with self.cldf_writer(args, cldf_spec='structure', clean=False) as writer:
        writer.cldf.add_component(LanguageTable)  # we reuse the one from above!
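A minimal sketch of how a single Source cell is turned into a Reference in the lexeme loop above (the cell value is made up, and Reference is assumed to be pycldf.sources.Reference, as suggested by the unqualified use in this module):

from pycldf.sources import Reference, Sources

src_cell = 'meira1999[55]'  # hypothetical value of lex['Source']
ref = Reference(*Sources.parse(src_cell.replace(';', ',')))
# Sources.parse yields ('meira1999', '55'); Reference pairs the source ID with its page context.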
def import_values(values, lang, features, codes, contributors, sources):  # pragma: no cover
    c = Contribution(
        id=lang['ID'],
        name='Dataset for {0}'.format(lang['Name']),
    )
    for i, cid in enumerate(lang['Coders'], start=1):
        DBSession.add(ContributionContributor(
            contribution=c,
            contributor_pk=contributors[cid],
            ord=i,
        ))

    l = GrambankLanguage(
        id=lang['ID'],
        name=lang['Name'],
        macroarea=lang['Macroarea'],
        latitude=lang['Latitude'],
        longitude=lang['Longitude'],
    )

    for value in values:
        vs = ValueSet(
            id=value['ID'],
            parameter_pk=features[value['Parameter_ID']],
            language=l,
            contribution=c,
        )
        Value(
            id=value['ID'],
            valueset=vs,
            name=value['Value'],
            description=value['Comment'],
            domainelement_pk=codes[value['Code_ID'] or '{}-NA'.format(value['Parameter_ID'])],
        )
        if value['Source']:
            for ref in value['Source']:
                sid, pages = Sources.parse(ref)
                ValueSetReference(valueset=vs, source_pk=sources[sid], description=pages)

    DBSession.add(c)
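The shape of the records this importer expects can be read off the keys it accesses. The following is an illustrative sketch only; all IDs, names and mapping values are made up, and actually running it requires an initialized clld DBSession:

lang = {
    'ID': 'abcd1234', 'Name': 'Example Language', 'Macroarea': 'South America',
    'Latitude': 1.5, 'Longitude': -60.0, 'Coders': ['XY'],
}
values = [{
    'ID': 'abcd1234-GB020', 'Parameter_ID': 'GB020', 'Value': '1',
    'Comment': None, 'Code_ID': 'GB020-1', 'Source': ['smith2001[12]'],
}]
# features, codes, contributors and sources map CLDF IDs to database primary keys:
features, codes, contributors, sources = {'GB020': 1}, {'GB020-1': 1}, {'XY': 1}, {'smith2001': 1}
import_values(values, lang, features, codes, contributors, sources)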
def load(self, dataset):
    """
    Load a CLDF dataset into the database.

    :param dataset: the `pycldf.Dataset` to load.
    """
    tables, ref_tables = schema(dataset)

    # update the DB schema:
    for t in tables:
        if self._create_table_if_not_exists(t):
            continue
        db_cols = {
            r[1]: r[2] for r in self.fetchall("PRAGMA table_info({0})".format(t.name))}
        for col in t.columns:
            if col.name not in db_cols:
                with self.connection() as conn:
                    conn.execute(
                        "ALTER TABLE {0} ADD COLUMN \"{1.name}\" {1.db_type}".format(t.name, col))
            else:
                if db_cols[col.name] != col.db_type:
                    raise ValueError(
                        'column {0}:{1} {2} redefined with new type {3}'.format(
                            t.name, col.name, db_cols[col.name], col.db_type))

    for t in ref_tables.values():
        self._create_table_if_not_exists(t)

    # then load the data:
    with self.connection() as db:
        db.execute('PRAGMA foreign_keys = ON;')
        pk = max(
            [r[0] for r in self.fetchall("SELECT ID FROM dataset", conn=db)] or [0]) + 1
        insert(
            db,
            'dataset',
            'ID,name,module,metadata_json',
            (pk, '{0}'.format(dataset), dataset.module, dumps(dataset.metadata_dict)))
        insert(
            db,
            'datasetmeta',
            'dataset_ID,key,value',
            *[(pk, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

        # load sources:
        rows = []
        for src in dataset.sources.items():
            values = [pk, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
            values.append(dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
            rows.append(tuple(values))
        insert(
            db,
            'SourceTable',
            ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
            *rows)

        # For regular tables, we extract and keep references to sources.
        refs = defaultdict(list)

        for t in tables:
            cols = {col.name: col for col in t.columns}
            ref_table = ref_tables.get(t.name)
            rows, keys = [], []
            for row in dataset[t.name]:
                keys, values = ['dataset_ID'], [pk]
                for k, v in row.items():
                    if ref_table and k == ref_table.consumes:
                        refs[ref_table.name].append((row[t.primary_key], v))
                    else:
                        col = cols[k]
                        if isinstance(v, list):
                            v = (col.separator or ';').join(col.convert(vv) for vv in v)
                        else:
                            v = col.convert(v)
                        keys.append(k)
                        values.append(v)
                rows.append(tuple(values))
            insert(db, t.name, keys, *rows)

        # Now insert the references, i.e. the associations with sources:
        for tname, items in refs.items():
            rows = []
            for oid, sources in items:
                for source in sources:
                    sid, context = Sources.parse(source)
                    rows.append([pk, oid, sid, context])
            oid_col = '{0}_ID'.format(tname.replace('Source', ''))
            insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)

        db.commit()
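A hedged usage sketch for the method above, assuming it is defined on a database wrapper class (the class name Database, the file names, and the metadata path are illustrative assumptions; Dataset.from_metadata is the standard pycldf entry point):

from pycldf import Dataset

db = Database('cldf.sqlite')  # assumed wrapper class defining load()
db.load(Dataset.from_metadata('Wordlist-metadata.json'))
# creates any missing tables/columns, inserts the rows, then records the source references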
def load(self, ds, args=None, verbose=False):
    """
    Load a CLDF dataset into the database.

    :param ds: dataset object providing `id`, `repo` and `cldf_reader()`.
    """
    print(ds)
    try:
        self.fetchone('select ID from dataset')
    except sqlite3.OperationalError:
        self.create(force=True)
    self.unload(ds)
    dataset = ds.cldf_reader()
    tables, ref_tables = schema(dataset)

    # update the DB schema:
    for t in tables:
        if self._create_table_if_not_exists(t):
            continue
        db_cols = {k.lower(): v for k, v in self.tables[t.name].items()}
        for col in t.columns:
            if col.name.lower() not in db_cols:
                with self.connection() as conn:
                    conn.execute(
                        "ALTER TABLE {0} ADD COLUMN `{1.name}` {1.db_type}".format(t.name, col))
            else:
                if db_cols[col.name.lower()] != col.db_type:
                    raise ValueError(
                        'column {0}:{1} {2} redefined with new type {3}'.format(
                            t.name, col.name, db_cols[col.name.lower()], col.db_type))

    for t in ref_tables.values():
        self._create_table_if_not_exists(t)

    self.update_schema()

    # then load the data:
    with self.connection() as db:
        db.execute('PRAGMA foreign_keys = ON;')
        insert(
            db,
            'dataset',
            'ID,name,version,metadata_json',
            (
                ds.id,
                '{0}'.format(dataset),
                ds.repo.hash() if ds.repo else '',
                json.dumps(dataset.metadata_dict)))
        insert(
            db,
            'datasetmeta',
            'dataset_ID,key,value',
            *[(ds.id, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

        # load sources:
        rows = []
        for src in dataset.sources.items():
            values = [ds.id, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
            values.append(
                json.dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
            rows.append(tuple(values))
        insert(
            db,
            'SourceTable',
            ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
            *rows)

        # For regular tables, we extract and keep references to sources.
        refs = collections.defaultdict(list)

        for t in tables:
            # We want to look up columns by the name used in the CLDF dataset.
            cols = {col.cldf_name: col for col in t.columns}
            # But we also want to look up primary keys by the database column name.
            cols_by_name = {col.name: col for col in t.columns}
            ref_table = ref_tables.get(t.name)
            rows, keys = [], []
            try:
                for row in dataset[t.name]:
                    keys, values = ['dataset_ID'], [ds.id]
                    for k, v in row.items():
                        if ref_table and k == ref_table.consumes:
                            col = cols_by_name[t.primary_key]
                            refs[ref_table.name].append((row[col.cldf_name], v))
                        else:
                            col = cols[k]
                            if isinstance(v, list):
                                v = (col.separator or ';').join(
                                    nfilter(col.convert(vv) for vv in v))
                            else:
                                v = col.convert(v)  # FIXME: only if non-local!
                            keys.append("`{0}`".format(col.name))
                            values.append(v)
                    keys, values = self.update_row(t.name, keys, values)
                    rows.append(tuple(values))
                insert(db, t.name, keys, *rows, verbose=verbose)
            except FileNotFoundError:  # pragma: no cover
                if t.name != 'CognateTable':  # An empty CognateTable is allowed.
                    raise  # pragma: no cover

        # Now insert the references, i.e. the associations with sources:
        for tname, items in refs.items():
            rows = []
            for oid, sources in items:
                for source in sources:
                    sid, context = Sources.parse(source)
                    rows.append([ds.id, oid, sid, context])
            oid_col = '{0}_ID'.format(tname.replace('Source', ''))
            insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)

        db.commit()
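Unlike the variant above, this load() takes a dataset wrapper (ds) exposing id, repo and cldf_reader() rather than a pycldf Dataset directly. A hedged sketch of how it might be called; the class name, database path and dataset class are hypothetical:

db = Database('lexibank.sqlite')  # assumed wrapper class defining this load()
ds = MyLexibankDataset()          # hypothetical dataset with .id, .repo and .cldf_reader()
db.load(ds, verbose=True)         # unloads any earlier rows for ds.id, then re-inserts them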