def test_Dataset_from_scratch(tmpdir, data):
    """Dataset.from_data infers the CLDF module from the file name.

    Unknown names are rejected, bad columns fail early, and invalid data is
    caught by validate().
    """
    values = str(tmpdir / 'values.csv')

    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(values, "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        # No assignment: from_data raises before returning, so binding the
        # result was dead code.
        Dataset.from_data(values)

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), values)
    ds = Dataset.from_data(values)
    assert ds.module == 'StructureDataset'
    assert len(list(ds['ValueTable'])) == 2
    ds.validate()

    # Writing every row twice duplicates IDs, which must break validation:
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()

    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)

    # Metadata can be read back even without the dc:conformsTo property:
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1
def test_copy(tmppath):
    """copy() must produce a destination file of identical size."""
    from clldutils.path import copy

    source = make_file(tmppath, name='test', text='abc')
    target = tmppath / 'other'
    copy(source, target)
    assert target.stat().st_size == source.stat().st_size
def test_copy(self):
    """copy() must produce a destination file of identical size."""
    from clldutils.path import copy

    src = self.make_file('test')
    dst = self.tmp_path('other')
    copy(src, dst)
    # assertEqual: `assertEquals` is a deprecated alias (removed in Python 3.12).
    self.assertEqual(src.stat().st_size, dst.stat().st_size)
def create_repos(dir_):
    """Create a minimal tsammalexdata repository layout under *dir_* for tests."""
    # Same CSV body is used for both test.csv copies below.
    csv_body = "a,b,c\n1,2,3\n4,5,6"

    tsammalexdata = dir_.join('tsammalexdata')
    tsammalexdata.mkdir()
    data = tsammalexdata.join('data')
    data.mkdir()

    with data.join('test.csv').open('w', encoding='utf8') as fp:
        fp.write(csv_body)
    with data.join('distribution.csv').open('w', encoding='utf8') as fp:
        fp.write("id,coregions__ids,countries_ids")

    # Seed the ecoregions data from the bundled fixture:
    test_eco_path = fixture_path('test_ecoregions.json')
    eco_path = data.join('ecoregions.json')
    copy(Path(test_eco_path), Path(eco_path))

    external = data.join('external')
    external.mkdir()
    with external.join('test.csv').open('w', encoding='utf8') as fp:
        fp.write(csv_body)

    # GBIF occurrence fixture goes into external/gbif/:
    external.join('gbif').mkdir()
    occurrences = fixture_path('abelmoschusesculentus.json')
    copy(Path(occurrences), Path(external.join('gbif', occurrences.name)))
    return dir_
def test_Dataset_from_scratch(tmpdir, data):
    """from_data() infers the CLDF module; validation and stats behave as expected."""
    values = str(tmpdir / 'values.csv')

    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(values, "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(values)

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), values)
    ds = Dataset.from_data(values)
    assert ds.module == 'StructureDataset'
    assert len(list(ds['ValueTable'])) == 2
    ds.validate()

    # Writing each row twice duplicates IDs and must break validation:
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()

    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)

    # Metadata without dc:conformsTo can still be read back:
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    # Custom tables show up in the stats with correct row counts:
    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {row[0]: row[2] for row in ds.stats()}
    assert counts['extra.csv'] == 0
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))
    # Known file name, but non-standard column name:
    Path(str(tmpdir / 'values.csv')).write_text(
        "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf-8')
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    # Capture warnings emitted while reading the data so they do not leak
    # into the test output.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'
    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    # Writing every row twice duplicates the IDs, which must fail validation:
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    # Metadata without dc:conformsTo can still be read back:
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1
    # Custom tables show up in the stats with correct row counts:
    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
def repos(tmppath, git_repo_factory):
    """Fixture: copy of the bundled test repository tree with git repos set up."""
    target = tmppath / 'lexibank-data'
    copytree(Path(__file__).parent.joinpath('repos'), target)
    # Initialize git repositories for the tree and the two test datasets:
    for directory in (
            target,
            target / 'datasets' / 'test_dataset',
            target / 'datasets' / 'test_dataset_cldf'):
        git_repo_factory(directory)
    copy(Path(pylexibank.__file__).parent.joinpath('cldf-metadata.json'), target)
    yield target
def test_check_new(fixturedir, capsys, mocker, tmpdir):
    """check_new reports the glosses of a new concept list."""
    from pyconcepticon.commands import check_new

    target = tmpdir.join('test.tsv')
    copy(fixturedir.joinpath('conceptlist2.tsv'), str(target))
    check_new(mocker.Mock(args=[str(target)], repos=None))
    out, _ = capsys.readouterr()
    assert 'Gloss DUST' in out
def download_and_unpack_zipfiles(url, dataset, *paths):
    """Download a zip archive and immediately unpack selected members.

    :param url: URL of the zip archive to download.
    :param dataset: dataset object; extracted members are copied to `dataset.raw`.
    :param paths: names of archive members to extract.
    """
    with TemporaryDirectory() as tmpdir:
        urlretrieve(url, tmpdir.joinpath('ds.zip').as_posix())
        with zipfile.ZipFile(tmpdir.joinpath('ds.zip').as_posix()) as zipf:
            for path in paths:
                # Extract into the temp dir, then copy into the dataset's
                # raw directory; the temp dir is cleaned up on exit.
                zipf.extract(as_posix(path), path=tmpdir.as_posix())
                copy(tmpdir.joinpath(path), dataset.raw)
def __init__(self, dataset):
    """Set up a CLDF Wordlist writer for *dataset*.

    Locates (or seeds) the dataset's CLDF metadata and augments the table
    schemas with any extra fields declared on the dataset's ORM classes.
    """
    self._count = defaultdict(int)
    self._cognate_count = defaultdict(int)
    self.dataset = dataset

    # Prefer MD_NAME in cldf_dir, fall back to ALT_MD_NAME; if neither
    # exists, seed cldf_dir with the packaged default metadata file.
    md = self.dataset.cldf_dir / MD_NAME
    if not md.exists():
        md = self.dataset.cldf_dir / ALT_MD_NAME
        if not md.exists():
            md = self.dataset.cldf_dir / MD_NAME
            copy(Path(__file__).parent / MD_NAME, md)
    self.wl = Wordlist.from_metadata(md)
    default_cldf = Wordlist.from_metadata(
        Path(__file__).parent / 'cldf-metadata.json')

    self.objects = {}
    self._obj_index = {}
    for cls in [
        self.dataset.lexeme_class,
        self.dataset.language_class,
        self.dataset.concept_class,
        self.dataset.cognate_class,
    ]:
        self.objects[cls.__cldf_table__()] = []
        self._obj_index[cls.__cldf_table__()] = set()
        # Column names and propertyUrls already present in the table schema:
        cols = set(
            col.header for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
        properties = set(
            col.propertyUrl.uri
            for col in self.wl[cls.__cldf_table__()].tableSchema.columns
            if col.propertyUrl)
        for field in cls.fieldnames():
            try:
                col = default_cldf[cls.__cldf_table__(), field]
                #
                # We added Latitude and Longitude to the default metadata later, and want to
                # make sure, existing datasets are upgraded silently.
                #
                if field in ['Latitude', 'Longitude'] \
                        and cls.__cldf_table__() == 'LanguageTable':
                    properties.add(col.propertyUrl.uri)
                    self.wl[cls.__cldf_table__(), field].propertyUrl = col.propertyUrl
                    self.wl[cls.__cldf_table__(), field].datatype = col.datatype
            except KeyError:
                # Field not covered by the default CLDF metadata - add it as
                # a plain string column.
                col = Column(name=field, datatype="string")
            # Only append columns not already present by propertyUrl or name:
            if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                    ((not col.propertyUrl) and (field not in cols)):
                self.wl[cls.__cldf_table__()].tableSchema.columns.append(
                    col)
def test_check(api, capsys, mocker, tmpdir, _main):
    """`check` does not flag a valid list but reports duplicated concept ids."""
    target = tmpdir.join('Sun-1991-1004.tsv')
    copy(api.repos.joinpath('concepticondata/conceptlists/Sun-1991-1004.tsv'),
         str(target))

    _main('check', str(target))
    out, _ = capsys.readouterr()
    assert 'Sun-1991-1004-2 ' not in out
    assert 'fast (adv.)' in out

    # Introduce a duplicated id and make sure it is reported:
    content = target.read_text(encoding='utf8')
    target.write_text(
        content.replace('Sun-1991-1004-1', 'Sun-1991-1004-2'), encoding='utf8')
    _main('check', str(target))
    out, _ = capsys.readouterr()
    print(out)
    assert 'Sun-1991-1004-2 ' in out
def test_link(self):
    """`link` fills in CONCEPTICON_GLOSS for matched concepts."""
    from pyconcepticon.commands import link

    # A directory argument is rejected:
    with self.assertRaises(ParserError):
        link(Mock(args=['.']))

    def count_attr(path, attr):
        # Number of rows in *path* that carry a non-empty *attr*.
        return len(nfilter([getattr(item, attr, None) for item in read_all(path)]))

    target = self.tmp_path('test.tsv')
    copy(Path(__file__).parent.joinpath('fixtures', 'conceptlist.tsv'), target)
    self.assertEqual(count_attr(target, 'CONCEPTICON_GLOSS'), 0)
    link(Mock(args=[target]))
    self.assertEqual(count_attr(target, 'CONCEPTICON_GLOSS'), 1)
def test_check(fixturedir, capsys, mocker, tmpdir):
    """`check` prints concept mappings, also after an id has been changed."""
    from pyconcepticon.commands import check

    target = tmpdir.join('Sun-1991-1004.tsv')
    copy(fixturedir.joinpath('concepticondata/conceptlists/Sun-1991-1004.tsv'),
         str(target))
    check(mocker.Mock(args=str(target), repos=fixturedir))
    out, _ = capsys.readouterr()
    assert '#1 FAST = "fast"' in out

    # Replace a concepticon id and re-run the check:
    content = target.read_text(encoding='utf8')
    target.write_text(content.replace('1631', '111111'), encoding='utf8')
    check(mocker.Mock(args=str(target), repos=fixturedir))
    out, _ = capsys.readouterr()
    assert '#1 FAST = "fast' in out
def download_and_unpack(self, url, *paths, **kw):
    """
    Download a zipfile and immediately unpack selected content.

    :param url: URL of the zip archive to download.
    :param paths: names of archive members to extract.
    :param kw: supports `log`, forwarded to `temp_download`.
    :return: None; extracted members are copied into this directory.
    """
    with self.temp_download(url, 'ds.zip', log=kw.pop('log', None)) as zipp:
        with TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(zipp.as_posix()) as zipf:
                for path in paths:
                    # Extract into the temp dir first, then copy the member
                    # here; both temp locations are cleaned up on exit.
                    zipf.extract(as_posix(path), path=tmpdir.as_posix())
                    copy(tmpdir.joinpath(path), self)
def test_validators(tmpdir, mocker, data):
    """validate() raises on invalid data and logs one warning per problem."""
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    with pytest.raises(ValueError):
        ds.validate()

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 2

    # Declaring Language_ID as a glottocode triggers two more warnings:
    for column in ds.tablegroup.tables[0].tableSchema.columns:
        if column.name == 'Language_ID':
            column.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'
    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 4
def test_link(fixturedir, tmpdir, capsys, _main):
    """`link` fills CONCEPTICON_GLOSS and reports unknown glosses and mismatches."""
    # A directory argument is rejected:
    with pytest.raises(SystemExit):
        _main('link', '.')

    def count_attr(path, attr):
        # Number of rows in *path* that carry a non-empty *attr*.
        return len(nfilter([getattr(item, attr, None) for item in read_all(str(path))]))

    target = tmpdir.join('test.tsv')
    copy(fixturedir.joinpath('conceptlist.tsv'), str(target))
    assert count_attr(target, 'CONCEPTICON_GLOSS') == 0
    _main('link', str(target))
    assert count_attr(target, 'CONCEPTICON_GLOSS') == 1

    # A list with problematic mappings produces diagnostics:
    copy(fixturedir.joinpath('conceptlist2.tsv'), str(target))
    _main('link', str(target))
    out, _ = capsys.readouterr()
    assert 'unknown CONCEPTICON_GLOSS' in out
    assert 'mismatch' in out
def test_link(self):
    """`link` fills CONCEPTICON_GLOSS; problems are reported on stdout."""
    from pyconcepticon.commands import link

    # A directory argument is rejected:
    with self.assertRaises(ParserError):
        link(Mock(args=['.'], data=None))

    def count_attr(path, attr):
        # Number of rows in *path* that carry a non-empty *attr*.
        return len(nfilter([getattr(item, attr, None) for item in read_all(path)]))

    target = self.tmp_path('test.tsv')
    copy(self.fixture_path('conceptlist.tsv'), target)
    self.assertEqual(count_attr(target, 'CONCEPTICON_GLOSS'), 0)
    link(Mock(args=[target], data=None))
    self.assertEqual(count_attr(target, 'CONCEPTICON_GLOSS'), 1)

    # A list with problematic mappings produces diagnostics:
    copy(self.fixture_path('conceptlist2.tsv'), target)
    with capture(link, Mock(args=[target], data=None)) as out:
        self.assertIn('unknown CONCEPTICON_GLOSS', out)
        self.assertIn('mismatch', out)
def test_link(mocker, fixturedir, tmpdir, capsys):
    """`link` fills CONCEPTICON_GLOSS and reports unknown glosses and mismatches."""
    from pyconcepticon.commands import link

    # A directory argument is rejected:
    with pytest.raises(ParserError):
        link(mocker.Mock(args=['.'], repos=None))

    def count_attr(path, attr):
        # Number of rows in *path* that carry a non-empty *attr*.
        return len(nfilter([getattr(item, attr, None) for item in read_all(str(path))]))

    target = tmpdir.join('test.tsv')
    copy(fixturedir.joinpath('conceptlist.tsv'), str(target))
    assert count_attr(target, 'CONCEPTICON_GLOSS') == 0
    link(mocker.Mock(args=[str(target)], repos=None))
    assert count_attr(target, 'CONCEPTICON_GLOSS') == 1

    # A list with problematic mappings produces diagnostics:
    copy(fixturedir.joinpath('conceptlist2.tsv'), str(target))
    link(mocker.Mock(args=[str(target)], repos=None))
    out, _ = capsys.readouterr()
    assert 'unknown CONCEPTICON_GLOSS' in out
    assert 'mismatch' in out
def test_ISO(tmppath):
    """ISO 639-3 code tables: date detection, lookup, hashing and ordering."""
    from clldutils.iso_639_3 import ISO, Code

    # A date embedded in the zip file name is picked up for the display string:
    dated_zip = tmppath / '20121201.zip'
    copy(FIXTURES.joinpath('iso.zip'), dated_zip)
    iso = ISO(dated_zip)
    assert '{0}'.format(iso) == 'ISO 639-3 code tables from 2012-12-01'

    # Reading the fixture zip under its original name yields a different date:
    iso = ISO(FIXTURES.joinpath('iso.zip'))
    assert '{0}'.format(iso) == 'ISO 639-3 code tables from 2016-07-25'

    # Every code type is exposed as a list attribute:
    for attr in Code._type_map.values():
        assert isinstance(getattr(iso, attr.lower()), list)

    assert len(iso.languages) == 7
    assert len(iso.macrolanguages[0].extension) == 2
    assert len(iso.languages[0].extension) == 0
    assert len(iso.retirements[0].change_to) == 1
    assert iso['auv'].change_to[0] in iso.languages

    # Codes are hashable, orderable and have readable string forms:
    lookup = {iso['auv']: 1}
    assert iso['auv'] in lookup
    assert '[twi]' in repr(sorted(iso.values(), reverse=True)[0])
    assert '%s' % iso['aab'] == 'Alumu-Tesu [aab]'
def retrieve(self, item, cdstar_catalog, checksums, mediacatalog):
    """
    - download
    - compute checksum
    - upload to CDSTAR
    - add to cdstar.json

    :return: Image instance, or None if the item has no source_url or the
        download failed.
    :raises ValueError: if the downloaded file duplicates a known checksum.
    """
    md = self.metadata(item) or {}
    source_url = md.pop('source_url', None)
    if not source_url:
        # Nothing to retrieve for this item.
        return
    # We turn the Staged_images instance into a `dict`, which we will enrich and then
    # turn into an Images instance.
    item = dict(zip(item.fields(), item.csv_row()))
    with TemporaryDirectory() as tmp:
        if isinstance(source_url, Path):
            # Local file: just copy it into the temp dir.
            fname = tmp.joinpath(source_url.name)
            copy(source_url, fname)
        else:
            # download the thing
            fname = self._download(source_url, tmp)
            if not fname:
                return
        # The MD5 checksum doubles as the canonical item id.
        checksum = md5(fname)
        if checksum in checksums:
            raise ValueError('duplicate item {0} {1}'.format(item['id'], checksum))
        item.update(md)
        item['id'] = checksum
        item['collection'] = 'Tsammalex'
        img = Images.fromdict(item)
        if checksum not in mediacatalog.items:
            # now upload to CDSTAR
            _, _, obj = list(cdstar_catalog.create(fname, item))[0]
            mediacatalog.add(obj)
        return img
def test_all(capsys, tmpdir, mocker, data):
    # Record all warnings so we can assert below on the ones emitted by the
    # CLI commands.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        md = str(tmpdir / 'md.json')
        copy(str(data / 'ds1.csv-metadata.json'), md)
        copy(str(data / 'ds1.bib'), str(tmpdir / 'ds1.bib'))
        copy(str(data / 'ds1.csv'), str(tmpdir / 'ds1.csv'))
        pdata = str(tmpdir / 'values.csv')
        copy(str(data / 'ds1.csv'), pdata)
        # A valid dataset: `validate` prints nothing.
        main(['validate', md])
        out, err = capsys.readouterr()
        assert not out
        main(['stats', pdata])
        out, err = capsys.readouterr()
        assert 'StructureDataset' in out
        main(['stats', md])
        # createdb without a database path argument must abort:
        with pytest.raises(SystemExit):
            main(['createdb', md])
        log = mocker.MagicMock()
        main(['createdb', md, str(tmpdir / 'test.sqlite')], log=log)
        assert log.info.called
        main(['dumpdb', md, str(tmpdir / 'test.sqlite')], log=log)
        # The commands above must have emitted 'Unspecified column'
        # UserWarnings (presumably from dumpdb - confirm against the CLI):
        uc = [
            w_ for w_ in w
            if issubclass(w_.category, UserWarning)
            and str(w_.message).startswith('Unspecified column')
        ]
        assert uc
        # Re-creating the db at an existing path must abort:
        with pytest.raises(SystemExit):
            main(['createdb', md, str(tmpdir / 'test.sqlite')], log=log)
def test_all(capsys, tmpdir, mocker, data):
    """Exercise the validate/stats/createdb CLI entry points."""
    md = str(tmpdir / 'md.json')
    copy(str(data / 'ds1.csv-metadata.json'), md)
    copy(str(data / 'ds1.bib'), str(tmpdir / 'ds1.bib'))
    copy(str(data / 'ds1.csv'), str(tmpdir / 'ds1.csv'))
    pdata = str(tmpdir / 'values.csv')
    copy(str(data / 'ds1.csv'), pdata)

    # A valid dataset: `validate` prints nothing.
    validate(mocker.MagicMock(args=[md]))
    out, _ = capsys.readouterr()
    assert not out

    stats(mocker.MagicMock(args=[pdata]))
    out, _ = capsys.readouterr()
    assert 'StructureDataset' in out
    stats(mocker.MagicMock(args=[md]))

    # createdb needs an explicit database path:
    with pytest.raises(ParserError):
        createdb(mocker.MagicMock(args=[md]))
    log = mocker.MagicMock()
    createdb(mocker.MagicMock(log=log, args=[md, str(tmpdir / 'test.sqlite')]))
    assert log.info.called
def urlretrieve(url, dest):
    """Test stub replacing urllib's urlretrieve: 'downloads' the iso.zip fixture."""
    fixture = FIXTURES.joinpath('iso.zip')
    copy(fixture, dest)
def repos(tmpd):
    """Fixture: copy of the bundled test repository tree plus CLDF metadata."""
    target = tmpd / 'lexibank-data'
    copytree(Path(__file__).parent.joinpath('repos'), target)
    copy(
        Path(pylexibank.__file__).parent.joinpath('cldf-metadata.json'),
        target)
    yield target