def test_write_read_archive(self):
    from pycldf.dataset import Dataset
    from pycldf.util import Archive

    ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
    out = self.tmp_path()

    with self.assertRaises(ValueError):
        ds.write(out.joinpath('non-existing'), '.tsv', archive=True)

    with Archive(self.tmp_path('archive.zip').as_posix(), 'w') as archive:
        ds.write('.', archive=archive)
        ds2 = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
        ds2.name = 'new_name'
        ds2.write('.', archive=archive)

    ds_out = Dataset.from_zip(self.tmp_path('archive.zip'), name='ds1')
    self.assertEqual(ds.rows, ds_out.rows)
    self.assertEqual(ds.metadata, ds_out.metadata)

    with Archive(self.tmp_path('archive.zip')) as archive:
        ds_out = Dataset.from_metadata('ds1.csv-metadata.json', container=archive)
        self.assertEqual(ds.rows, ds_out.rows)
        self.assertEqual(ds.metadata, ds_out.metadata)

    ds.write(out, '.tsv', archive=True)
    ds_out = Dataset.from_zip(out.joinpath('ds1.zip'))
    self.assertEqual(ds.rows, ds_out.rows)
    self.assertEqual(ds.metadata, ds_out.metadata)

def stats(args):
    """
    cldf stats <DATASET>

    Print basic stats for CLDF dataset <DATASET>, where <DATASET> may be the path to
    - a CLDF metadata file
    - a CLDF core data file
    - a CLDF zip archive
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.zip':
        ds = Dataset.from_zip(fname)
    elif fname.name.endswith(MD_SUFFIX):
        ds = Dataset.from_metadata(fname)
    else:
        ds = Dataset.from_file(fname)
    print(fname)
    stats_ = ds.stats
    print("""
Name: %s
Different languages: %s
Different parameters: %s
Rows: %s
""" % (
        ds.name,
        len(stats_['languages']),
        len(stats_['parameters']),
        stats_['rowcount']))

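# Hedged usage sketch for the stats() command above; this is not part of the
# original module. It assumes the command receives an argparse-style namespace
# whose `args` attribute is a list of positional arguments, and the path
# 'ds1.csv-metadata.json' is a hypothetical example file.
def _stats_usage_example():
    import argparse

    cli_args = argparse.Namespace(args=['ds1.csv-metadata.json'])
    stats(cli_args)  # prints the dataset name, language/parameter counts and row count
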
def get_dataset(fname):
    """Load a CLDF dataset.

    Load the file as a JSON CLDF metadata description file, or as a
    metadata-free dataset contained in a single CSV file. The distinction is
    made based on the file extension: `.json` files are loaded as metadata
    descriptions, all other files are matched against the CLDF module
    specifications. Directories are checked for the presence of any CLDF
    datasets in undefined order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    Dataset
    """
    fname = Path(fname)
    if not fname.exists():
        raise FileNotFoundError('{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)

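# Hedged usage sketch for get_dataset() above; not part of the original module.
# The file names are hypothetical placeholders; the point is the dispatch on
# the '.json' suffix described in the docstring.
def _get_dataset_usage_example():
    # A .json file is read as a CLDF metadata description ...
    ds_from_md = get_dataset('Wordlist-metadata.json')
    # ... any other known file name is matched against the CLDF module
    # specifications and read as a metadata-free dataset.
    ds_from_data = get_dataset('values.csv')
    return ds_from_md.module, ds_from_data.module
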
def make_dataset(name='test', fields=None, rows=None):
    ds = Dataset(name)
    ds.fields = fields or tuple(f[0] for f in REQUIRED_FIELDS)
    if rows:
        for row in rows:
            ds.add_row(row)
    return ds

def test_dataset_from_file(self):
    from pycldf.dataset import Dataset

    ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
    self.assertIn('ds1', repr(ds))
    self.assertEqual(len(ds), 2)
    self.assertEqual(ds.table.url, 'ds1.csv')
    self.assertEqual(ds.metadata['dc:creator'], 'The Author')

    row = ['3', 'abcd1234', 'fid2', 'maybe', '', 'new[4]']
    with self.assertRaises(ValueError):
        ds.add_row(row)

    ds.sources.add('@book{new,\nauthor={new author}}')
    res = ds.add_row(row)
    self.assertEqual(res.url, 'http://example.org/valuesets/3')
    self.assertEqual(len(res.refs), 1)
    self.assertEqual(
        res.valueUrl('Language_ID'),
        'http://glottolog.org/resource/languoid/id/abcd1234')
    res = ds.add_row(['4', None, None, None, None, None])
    self.assertEqual(res.valueUrl('Language_ID'), None)

    out = self.tmp_path()
    ds.write(out, '.tsv')
    self.assertTrue(out.joinpath('ds1.bib').exists())
    md = load(out.joinpath('ds1.tsv-metadata.json'))
    self.assertEqual('ds1.tsv', md['tables'][0]['url'])
    Dataset.from_file(out.joinpath('ds1.tsv'))

def test_dataset_from_metadata(self):
    from pycldf.dataset import Dataset

    ds = Dataset.from_metadata(FIXTURES.joinpath('ds1.csv-metadata.json'))
    self.assertIn('ds1', repr(ds))
    with self.assertRaises(ValueError):
        Dataset.from_metadata(FIXTURES.joinpath('ds1.csv-me.json'))

def test_duplicate_component(ds, tmpdir):
    # adding a component twice is not possible:
    t = ds.add_component('ValueTable')
    t.url = Link('other.csv')
    with pytest.raises(ValueError):
        ds.add_component('ValueTable')

    # JSON descriptions with duplicate components cannot be read:
    md = tmpdir / 'md.json'
    json = """\
{
    "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#StructureDataset",
    "tables": [
        {"url": "values.csv"},
        COMPS
    ]
}"""
    comp = """
{
    "url": "values.csv",
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable",
    "tableSchema": {
        "columns": [
            {
                "name": "ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id"
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference"
            },
            {
                "name": "Parameter_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference"
            },
            {
                "name": "Value",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#value"
            }
        ]
    }
}"""
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        md.write_text(json.replace('COMPS', comp), encoding='utf8')
        (tmpdir / 'values.csv').write_text(
            "ID,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf8')
        ds = Dataset.from_metadata(str(md))
        assert ds.validate()

        md.write_text(json.replace('COMPS', ', '.join([comp, comp])), encoding='utf8')
        with pytest.raises(ValueError) as excinfo:
            Dataset.from_metadata(str(md))
        assert 'duplicate component' in excinfo.exconly()

def _get_dataset(args):
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)

def test_invalid_dataset_from_file(self):
    from pycldf.dataset import Dataset

    log = Mock(warn=Mock())
    with patch('pycldf.dataset.log', log):
        Dataset.from_file(FIXTURES.joinpath('invalid.csv'), skip_on_error=True)
    self.assertEqual(log.warn.call_count, 2)

    with self.assertRaises(ValueError):
        Dataset.from_file(FIXTURES.joinpath('invalid.csv'))

def test_modules(tmpdir):
    ds = Dataset(_make_tg(tmpdir))
    assert ds.primary_table is None
    ds = Dataset(_make_tg(tmpdir, {"url": "data.csv"}))
    assert ds.primary_table is None
    ds = Dataset(_make_tg(tmpdir, {
        "url": "data.csv",
        "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable"}))
    assert ds.primary_table == 'ValueTable'
    assert Wordlist.in_dir(str(tmpdir)).primary_table
    assert Dictionary.in_dir(str(tmpdir)).primary_table
    assert StructureDataset.in_dir(str(tmpdir)).primary_table

def test_duplicate_component(ds, tmpdir):
    # adding a component twice is not possible:
    ds.add_component('ValueTable')
    with pytest.raises(ValueError):
        ds.add_component('ValueTable')

    # JSON descriptions with duplicate components cannot be read:
    md = tmpdir / 'md.json'
    json = """\
{
    "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#StructureDataset",
    "tables": [
        {"url": "values.csv"},
        COMPS
    ]
}"""
    comp = """
{
    "url": "values.csv",
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable",
    "tableSchema": {
        "columns": [
            {
                "name": "ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id"
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference"
            },
            {
                "name": "Parameter_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference"
            },
            {
                "name": "Value",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#value"
            }
        ]
    }
}"""
    md.write_text(json.replace('COMPS', comp), encoding='utf8')
    (tmpdir / 'values.csv').write_text(
        "ID,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf8')
    ds = Dataset.from_metadata(str(md))
    assert ds.validate()

    md.write_text(json.replace('COMPS', ', '.join([comp, comp])), encoding='utf8')
    with pytest.raises(ValueError) as excinfo:
        Dataset.from_metadata(str(md))
    assert 'duplicate component' in excinfo.exconly()

def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

def test_validate(self):
    from pycldf.dataset import Dataset, REQUIRED_FIELDS

    ds = Dataset('name')
    with self.assertRaises(AssertionError):
        # missing required fields!
        ds.fields = ('a',)

    with self.assertRaises(AssertionError):
        # fields must be tuple
        ds.fields = [variants[-1] for variants in REQUIRED_FIELDS]

    ds.fields = tuple(variants[-1] for variants in REQUIRED_FIELDS)
    with self.assertRaises(ValueError):
        # fields cannot be reassigned!
        ds.fields = tuple(variants[0] for variants in REQUIRED_FIELDS)

def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0

def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    Path(str(tmpdir / 'values.csv')).write_text(
        "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf-8')
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0

def __attrs_post_init__(self):
    if self.default_metadata_path:
        self.default_metadata_path = pathlib.Path(self.default_metadata_path)
        try:
            Dataset.from_metadata(self.default_metadata_path)
        except Exception:
            raise ValueError(
                'invalid default metadata: {0}'.format(self.default_metadata_path))
    else:
        self.default_metadata_path = pkg_path(
            'modules', '{0}{1}'.format(self.module, MD_SUFFIX))

    if not self.metadata_fname:
        self.metadata_fname = self.default_metadata_path.name

def test_db_write(tmpdir, data):
    ds = Dataset.from_metadata(data / 'ds1.csv-metadata.json')
    db = Database(ds, fname=str(tmpdir.join('db.sqlite')))
    db.write_from_tg()
    #shutil.copy(str(tmpdir.join('db.sqlite')), 'db.sqlite')
    assert len(
        db.query(
            "select * from ValueTable where cldf_parameterReference = 'fid1'")
    ) == 1
    assert len(db.query('select * from SourceTable')) == 3
    assert len(
        db.query(
            "select valuetable_cldf_id from ValueTable_SourceTable where context = '2-5'"
        )) == 1
    assert db.read()['ValueTable'][0]['cldf_source'] == [
        '80086', 'meier2015[2-5]']

    db.to_cldf(str(tmpdir.join('cldf')))
    assert tmpdir.join('cldf', 'ds1.bib').check()
    assert '80086;meier2015[2-5]' in tmpdir.join('cldf', 'ds1.csv').read_text('utf8')

    with pytest.raises(ValueError):
        db.write_from_tg()

    with pytest.raises(NotImplementedError):
        db.write_from_tg(_exists_ok=True)

    db.write_from_tg(_force=True)

def test_db(data, db):
    ds = Dataset.from_metadata(str(data / 'ds1.csv-metadata.json'))
    db.create()
    db.load(ds)
    assert len(db.fetchall("SELECT name FROM dataset")) == 1
    with pytest.raises(IntegrityError):
        db.load(ds)
    db.delete(db.fetchone("SELECT ID FROM dataset")[0])
    db.load(ds)
    db.drop()

def add_row(self, row):
    #
    # add segments column, value cleaned from "<>=..."
    #
    row = CldfDatasetBase.add_row(self, row)
    if row:
        for col, validator in self.validators.items():
            if not validator(row):
                del self._rows[row['ID']]
                return
    return row

def test_write_read(self):
    from pycldf.dataset import Dataset, REQUIRED_FIELDS

    row = ['1', 'abcd1234', 'fid', 'yes']
    ds = Dataset('name')
    ds.fields = tuple(v[0] for v in REQUIRED_FIELDS)
    ds.add_row(row)
    ds.write(self.tmp_path())
    self.assertTrue(self.tmp_path('name.csv').exists())
    ds2 = Dataset.from_file(self.tmp_path('name.csv'))
    self.assertEqual(list(ds2[0].values()), row)
    self.assertEqual(list(ds2['1'].values()), row)

def test_db_write(tmpdir, data):
    #import shutil
    ds = Dataset.from_metadata(data / 'ds1.csv-metadata.json')
    db = Database(ds, fname=str(tmpdir.join('db.sqlite')))
    db.write_from_tg()
    #shutil.copy(str(tmpdir.join('db.sqlite')), 'db.sqlite')
    assert len(db.query("select * from ValueTable where cldf_parameterReference = 'fid1'")) == 1
    assert len(db.query('select * from SourceTable')) == 2
    assert len(db.query("select * from ValueTable_SourceTable where context = '2-5'")) == 1
    assert db.read()['ValueTable'][0]['cldf_source'] == ['80086', 'meier2015[2-5]']
    db.to_cldf(str(tmpdir.join('cldf')))
    assert tmpdir.join('cldf', 'ds1.bib').check()
    assert '80086;meier2015[2-5]' in tmpdir.join('cldf', 'ds1.csv').read_text('utf8')

def test_CldfDownload(self):
    from clld.web.adapters.cldf import CldfDownload

    tmp = self.tmp_path('dl.zip')
    dl = CldfDownload(Dataset, 'clld')
    dl.create(self.env['request'], verbose=False, outfile=tmp)
    ds = CldfDataset.from_zip(tmp)
    self.assertEqual(ds.name, 'dataset-contribution-contribution')
    self.assertEqual(
        'http://localhost/values/{ID}', ds.table.schema.aboutUrl)
    self.assertEqual(
        'http://localhost/languages/{Language_ID}',
        ds.table.schema.columns['Language_ID'].valueUrl)
    self.assertEqual(len(ds.rows), 3)
    self.assertIn('Language_glottocode', ds[0])
    self.assertIn('10-20', ds['value2']['Source'])

def test_validators(tmpdir, mocker, data):
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        ds.validate()

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 2

    for col in ds.tablegroup.tables[0].tableSchema.columns:
        if col.name == 'Language_ID':
            col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 4

def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(
                            CognatesetCounterpart(
                                cognateset=cs,
                                counterpart=cp,
                                cognate_detection_method=cognate[
                                    'Cognate_detection_method'],
                                alignment=cognate['Alignment'],
                                alignment_method=cognate['Alignment_method'],
                                doubt=cognate['Doubt'] == 'True'))

def test_CldfDownload(env, tmppath, mocker, capsys):
    from clld.web.adapters.cldf import CldfDownload

    mocker.patch('clld.web.adapters.cldf.transaction')
    tmp = tmppath / 'dl.zip'
    dl = CldfDownload(Dataset, 'clld')
    dl.create(env['request'], outfile=tmp, verbose=True)
    out, err = capsys.readouterr()
    assert 'Value' in out

    outdir = tmppath / 'cldf'
    with ZipFile(tmp.as_posix()) as zip:
        assert 'Wordlist-metadata.json' in zip.namelist()
        zip.extractall(str(outdir))

    ds = CldfDataset.from_metadata(outdir.joinpath('Wordlist-metadata.json'))
    assert ds.module == 'Wordlist'
    values = list(ds[ds.primary_table])
    assert len(values) == 3
    for v in values:
        list(ds.sources.expand_refs(v['Source']))

def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))

def areality(metadata, feature_id):
    # Read the CLDF dataset specified by a metadata file:
    grambank = Dataset.from_metadata(metadata)

    for feature in grambank['ParameterTable']:
        if feature['ID'] == feature_id:
            break
    else:
        raise ValueError('unknown Grambank feature ID: {0}'.format(feature_id))

    # Information about macroareas is stored in the table with language metadata:
    area_map = {l['ID']: l['Macroarea'] for l in grambank['LanguageTable']}

    # We want to map numeric feature values to more descriptive ones, thus we
    # have to read the value descriptions from the code table:
    codes = {c['ID']: c['Description'] for c in grambank['CodeTable']}

    res = Counter()
    for value in grambank['ValueTable']:
        if value['Parameter_ID'] == feature_id:
            # We count the occurrences of each value by (area, code):
            res.update([(
                area_map[value['Language_ID']],
                codes[value['Code_ID']] if value['Code_ID'] else 'Not known')])
    return feature, res

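# Hedged usage sketch for areality() above; not part of the original code.
# The metadata path and the feature ID 'GB020' are hypothetical placeholders
# for a local copy of the Grambank CLDF dataset.
def _areality_usage_example():
    feature, counts = areality(
        'grambank/cldf/StructureDataset-metadata.json', 'GB020')
    # `counts` is a Counter keyed by (macroarea, value description):
    for (area, description), n in counts.most_common():
        print('{0}\t{1}\t{2}'.format(area, description, n))
    return feature, counts
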
def itercldf(dataset, id_):
    archive = Archive(dataset.raw.joinpath('{0}.zip'.format(id_)))
    for name in archive.namelist():
        if name.endswith(MD_SUFFIX):
            yield Dataset.from_metadata(Path(name), archive)

def read_cldf_data(config):
    """
    Read CLDF data as lists of Python dictionaries.

    This function interfaces with `pycldf`. The tables and columns to extract
    are obtained from `*_fields` entries in `config`.

    Parameters
    ----------
    config : dict
        A dictionary with the configurations.
    """
    # Read dataset from metadata
    metadata = config["base_path"] / "demo_cldf" / "cldf-metadata.json"
    dataset = Dataset.from_metadata(metadata.as_posix())

    # Transform the dataset into a Python data structure (`cldf_data`) suitable
    # for Jinja template manipulation. `cldf_data` is a dictionary of tables,
    # where the key is the table name and the value is a dictionary with
    # `columns` (the sorted list of column names found in the rows) and `rows`.
    # `rows` is a list of dictionaries, with the `value` to be reported and
    # optionally other information (such as the `url`) which may or may not be
    # used by the template (`value` is always used).
    # TODO: make conversion less explicit and with fewer loops
    # table.base -> /home/tresoldi/src/staticcldf/demo_cldf
    # table.url -> cognates.csv
    # table.local_name -> cognates.csv
    # for col in table.tableSchema.columns:
    #   - col.datatype.base -> string, decimal
    #   - col.header -> Alignment_Source
    #   - col.name -> Alignment_Source
    #   - col.propertyUrl -> None, http://cldf.clld.org/v1.0/terms.rdf#alignment
    #   - col.valueUrl -> None, http://glottolog.org/resource/languoid/id/{glottolog_id}
    cldf_data = {}
    for table in dataset.tables:
        table_key = table.local_name.split(".")[0]
        column_names = [col.name for col in table.tableSchema.columns]
        valueUrls = [col.valueUrl for col in table.tableSchema.columns]
        datatypes = [col.datatype.base for col in table.tableSchema.columns]

        # Holder for the table values in the returned structure
        table_data = []

        # Iterate over all rows for the current table
        for row in table:
            # Holder for the row in the returned structure
            row_data = []

            # Iterate over all columns for the current row
            for column, valueUrl in zip(column_names, valueUrls):
                if not row[column]:
                    value = ""
                elif isinstance(row[column], (list, tuple)):
                    value = " ".join([str(value) for value in row[column]])
                else:
                    value = str(row[column])

                if valueUrl:
                    # Ugly replacement, but works with CLDF metadata
                    # (assuming there is a single replacement)
                    var_name = list(valueUrl.variable_names)[0]
                    url = valueUrl.expand(**{var_name: value})
                else:
                    url = None

                # Append computed values to `row_data`
                row_data.append({"value": value, "url": url})

            # Append current row to the table
            table_data.append(row_data)

        # Append contents to overall table
        column_data = [
            {"name": name, "datatype": datatype}
            for name, datatype in zip(column_names, datatypes)
        ]
        cldf_data[table_key] = {"columns": column_data, "rows": table_data}

    # TODO: remove those which are all empty or None
    return cldf_data

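# Hedged usage sketch for read_cldf_data() above; not part of the original
# module. It assumes a `config` dict whose "base_path" is a pathlib.Path
# pointing at a directory that contains demo_cldf/cldf-metadata.json.
def _read_cldf_data_usage_example():
    import pathlib

    config = {"base_path": pathlib.Path(".")}  # hypothetical project root
    cldf_data = read_cldf_data(config)
    # Each table maps to {"columns": [...], "rows": [...]}; every cell is a
    # dict with a "value" and an optional "url":
    for table_key, table in cldf_data.items():
        names = [col["name"] for col in table["columns"]]
        print(table_key, len(table["rows"]), "rows; columns:", names)
    return cldf_data
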
def test_Dataset_from_data_empty_file(tmpdir):
    write_text(str(tmpdir / 'values.csv'), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))

def test_Dataset_from_data_empty_file(tmpdir):
    Path(str(tmpdir / 'values.csv')).write_text('', encoding='utf-8')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))

def dataset(self, req):
    ds = Dataset('%s-%s-%s' % (
        req.dataset.id, self.obj.__class__.__name__.lower(), self.obj.id))
    cols = self.columns(req)
    ds.fields = tuple(
        col['name'] if isinstance(col, dict) else col for col in cols)
    ds.table.schema.aboutUrl = url_template(req, 'value', 'ID')

    for col in cols:
        if isinstance(col, dict):
            name = col.pop('name')
            for attr, value in col.items():
                setattr(ds.table.schema.columns[name], attr, value)

    ds.metadata['dc:bibliographicCitation'] = text_citation(req, self.obj)
    ds.metadata['dc:publisher'] = '%s, %s' % (
        req.dataset.publisher_name, req.dataset.publisher_place)
    ds.metadata['dc:license'] = req.dataset.license
    ds.metadata['dc:issued'] = req.dataset.published.isoformat()
    ds.metadata['dc:title'] = self.obj.name
    ds.metadata['dc:creator'] = self.obj.formatted_contributors()
    ds.metadata['dc:identifier'] = req.resource_url(self.obj)
    ds.metadata['dc:isPartOf'] = req.resource_url(req.dataset)
    ds.metadata['dcat:accessURL'] = req.route_url('download')

    for value in self.value_query():
        refs, sources = self.refs_and_sources(req, value)
        row = self.row(req, value, refs)
        if row:
            ds.sources.add(*sources)
            ds.add_row(row)
    return ds