Code example #1
File: test_dataset.py Project: afcarl/pycldf
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'),
               "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1
Code example #2
File: test_path.py Project: garnerargoed/clldutils
def test_copy(tmppath):
    from clldutils.path import copy

    src = make_file(tmppath, name='test', text='abc')
    dst = tmppath / 'other'
    copy(src, dst)
    assert src.stat().st_size == dst.stat().st_size
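Across these examples, clldutils.path.copy is called with both pathlib.Path and plain str arguments. For reference, a minimal sketch of the same call outside a test harness; the file names here are hypothetical:

from pathlib import Path
from clldutils.path import copy

# Copy one file to another location; the examples on this page pass
# both Path objects and strings, so either form should work.
copy(Path('settings.json'), Path('settings.backup.json'))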
Code example #3
File: test_path.py Project: clld/clldutils
    def test_copy(self):
        from clldutils.path import copy

        src = self.make_file('test')
        dst = self.tmp_path('other')
        copy(src, dst)
        self.assertEqual(src.stat().st_size, dst.stat().st_size)
Code example #4
def create_repos(dir_):
    tsammalexdata = dir_.join('tsammalexdata')
    tsammalexdata.mkdir()
    data = tsammalexdata.join('data')
    data.mkdir()

    with data.join('test.csv').open('w', encoding='utf8') as fp:
        fp.write("""\
a,b,c
1,2,3
4,5,6""")

    with data.join('distribution.csv').open('w', encoding='utf8') as fp:
        fp.write("id,coregions__ids,countries_ids")

    test_eco_path = fixture_path('test_ecoregions.json')
    eco_path = data.join('ecoregions.json')

    copy(Path(test_eco_path), Path(eco_path))

    external = data.join('external')
    external.mkdir()
    with external.join('test.csv').open('w', encoding='utf8') as fp:
        fp.write("""\
a,b,c
1,2,3
4,5,6""")
    external.join('gbif').mkdir()
    occurrences = fixture_path('abelmoschusesculentus.json')

    copy(Path(occurrences), Path(external.join('gbif', occurrences.name)))

    return dir_
Code example #5
File: test_dataset.py Project: glottobank/pycldf
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Code example #6
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    Path(str(tmpdir / 'values.csv')).write_text(
        "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf-8')
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))
        assert ds.module == 'StructureDataset'

        assert len(list(ds['ValueTable'])) == 2
        ds.validate()
        ds['ValueTable'].write(2 * list(ds['ValueTable']))
        with pytest.raises(ValueError):
            ds.validate()
        md = ds.write_metadata()
        Dataset.from_metadata(md)
        repr(ds)
        del ds.tablegroup.common_props['dc:conformsTo']
        Dataset.from_metadata(ds.write_metadata())
        assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Code example #7
def repos(tmppath, git_repo_factory):
    repos = tmppath / 'lexibank-data'
    copytree(Path(__file__).parent.joinpath('repos'), repos)
    git_repo_factory(repos)
    git_repo_factory(repos / 'datasets' / 'test_dataset')
    git_repo_factory(repos / 'datasets' / 'test_dataset_cldf')
    copy(Path(pylexibank.__file__).parent.joinpath('cldf-metadata.json'), repos)
    yield repos
Code example #8
def test_check_new(fixturedir, capsys, mocker, tmpdir):
    from pyconcepticon.commands import check_new

    test = tmpdir.join('test.tsv')
    copy(fixturedir.joinpath('conceptlist2.tsv'), str(test))
    check_new(mocker.Mock(args=[str(test)], repos=None))
    out, err = capsys.readouterr()
    assert 'Gloss DUST' in out
Code example #9
File: util.py Project: marctang/lexibank-data-old
def download_and_unpack_zipfiles(url, dataset, *paths):
    """Download zipfiles and immediately unpack the content"""
    with TemporaryDirectory() as tmpdir:
        urlretrieve(url, tmpdir.joinpath('ds.zip').as_posix())
        with zipfile.ZipFile(tmpdir.joinpath('ds.zip').as_posix()) as zipf:
            for path in paths:
                zipf.extract(as_posix(path), path=tmpdir.as_posix())
                copy(tmpdir.joinpath(path), dataset.raw)
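A hedged usage sketch for the helper above, assuming a dataset object whose raw attribute is the target directory; the URL and the member path are made up:

# Download the archive once and extract only the listed members into dataset.raw.
download_and_unpack_zipfiles('https://example.org/ds.zip', dataset, 'data/forms.csv')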
Code example #10
File: cldf.py Project: basagashka/pylexibank
    def __init__(self, dataset):
        self._count = defaultdict(int)
        self._cognate_count = defaultdict(int)
        self.dataset = dataset

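        # Locate the dataset's CLDF metadata, trying MD_NAME first, then
        # ALT_MD_NAME; if neither exists, seed the dataset by copying the
        # default metadata shipped next to this module.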
        md = self.dataset.cldf_dir / MD_NAME
        if not md.exists():
            md = self.dataset.cldf_dir / ALT_MD_NAME
            if not md.exists():
                md = self.dataset.cldf_dir / MD_NAME
                copy(Path(__file__).parent / MD_NAME, md)
        self.wl = Wordlist.from_metadata(md)
        default_cldf = Wordlist.from_metadata(
            Path(__file__).parent / 'cldf-metadata.json')

        self.objects = {}
        self._obj_index = {}
        for cls in [
                self.dataset.lexeme_class,
                self.dataset.language_class,
                self.dataset.concept_class,
                self.dataset.cognate_class,
        ]:
            self.objects[cls.__cldf_table__()] = []
            self._obj_index[cls.__cldf_table__()] = set()

            cols = set(
                col.header
                for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
            properties = set(
                col.propertyUrl.uri
                for col in self.wl[cls.__cldf_table__()].tableSchema.columns
                if col.propertyUrl)
            for field in cls.fieldnames():
                try:
                    col = default_cldf[cls.__cldf_table__(), field]
                    #
                    # We added Latitude and Longitude to the default metadata later, and want
                    # to make sure existing datasets are upgraded silently.
                    #
                    if field in ['Latitude', 'Longitude'] \
                            and cls.__cldf_table__() == 'LanguageTable':
                        properties.add(col.propertyUrl.uri)
                        self.wl[cls.__cldf_table__(),
                                field].propertyUrl = col.propertyUrl
                        self.wl[cls.__cldf_table__(),
                                field].datatype = col.datatype
                except KeyError:
                    col = Column(name=field, datatype="string")
                if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                        ((not col.propertyUrl) and (field not in cols)):
                    self.wl[cls.__cldf_table__()].tableSchema.columns.append(
                        col)
Code example #11
def test_check(api, capsys, mocker, tmpdir, _main):
    test = tmpdir.join('Sun-1991-1004.tsv')
    copy(api.repos.joinpath('concepticondata/conceptlists/Sun-1991-1004.tsv'), str(test))
    _main('check', str(test))
    out, err = capsys.readouterr()
    assert 'Sun-1991-1004-2 ' not in out
    assert 'fast (adv.)' in out

    t = test.read_text(encoding='utf8')
    test.write_text(t.replace('Sun-1991-1004-1', 'Sun-1991-1004-2'), encoding='utf8')
    _main('check', str(test))
    out, err = capsys.readouterr()
    print(out)
    assert 'Sun-1991-1004-2 ' in out
Code example #12
    def test_link(self):
        from pyconcepticon.commands import link

        with self.assertRaises(ParserError):
            link(Mock(args=['.']))

        def nattr(p, attr):
            return len(nfilter([getattr(i, attr, None) for i in read_all(p)]))

        test = self.tmp_path('test.tsv')
        copy(Path(__file__).parent.joinpath('fixtures', 'conceptlist.tsv'), test)
        self.assertEqual(nattr(test, 'CONCEPTICON_GLOSS'), 0)
        link(Mock(args=[test]))
        self.assertEqual(nattr(test, 'CONCEPTICON_GLOSS'), 1)
Code example #13
def test_check(fixturedir, capsys, mocker, tmpdir):
    from pyconcepticon.commands import check

    test = tmpdir.join('Sun-1991-1004.tsv')
    copy(fixturedir.joinpath('concepticondata/conceptlists/Sun-1991-1004.tsv'),
         str(test))
    check(mocker.Mock(args=str(test), repos=fixturedir))
    out, err = capsys.readouterr()
    assert '#1 FAST = "fast"' in out

    t = test.read_text(encoding='utf8')
    test.write_text(t.replace('1631', '111111'), encoding='utf8')
    check(mocker.Mock(args=str(test), repos=fixturedir))
    out, err = capsys.readouterr()
    assert '#1 FAST = "fast' in out
Code example #14
    def download_and_unpack(self, url, *paths, **kw):
        """
        Download a zipfile and immediately unpack selected content.

        :param url: URL of the zip archive to download.
        :param paths: Member names to extract from the archive.
        :param kw: Optional keyword log, passed on to temp_download.
        :return: None
        """
        with self.temp_download(url, 'ds.zip', log=kw.pop('log', None)) as zipp:
            with TemporaryDirectory() as tmpdir:
                with zipfile.ZipFile(zipp.as_posix()) as zipf:
                    for path in paths:
                        zipf.extract(as_posix(path), path=tmpdir.as_posix())
                        copy(tmpdir.joinpath(path), self)
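By analogy with example #9, a hedged usage sketch; the URL and member path are made up, and raw stands in for whatever directory-like object this method is defined on:

# Fetch the archive once, then unpack a single member into this directory.
raw.download_and_unpack('https://example.org/ds.zip', 'data/forms.csv')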
Code example #15
File: test_dataset.py Project: glottobank/pycldf
def test_validators(tmpdir, mocker, data):
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        ds.validate()

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 2

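    # Re-declare Language_ID as a glottocode column; the invalid values then
    # trigger two additional warnings on the next validation pass.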
    for col in ds.tablegroup.tables[0].tableSchema.columns:
        if col.name == 'Language_ID':
            col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 4
Code example #16
def test_link(fixturedir, tmpdir, capsys, _main):
    with pytest.raises(SystemExit):
        _main('link', '.')

    def nattr(p, attr):
        return len(nfilter([getattr(i, attr, None) for i in read_all(str(p))]))

    test = tmpdir.join('test.tsv')
    copy(fixturedir.joinpath('conceptlist.tsv'), str(test))
    assert nattr(test, 'CONCEPTICON_GLOSS') == 0
    _main('link', str(test))
    assert nattr(test, 'CONCEPTICON_GLOSS') == 1

    copy(fixturedir.joinpath('conceptlist2.tsv'), str(test))
    _main('link', str(test))
    out, err = capsys.readouterr()
    assert 'unknown CONCEPTICON_GLOSS' in out
    assert 'mismatch' in out
Code example #17
    def test_link(self):
        from pyconcepticon.commands import link

        with self.assertRaises(ParserError):
            link(Mock(args=['.'], data=None))

        def nattr(p, attr):
            return len(nfilter([getattr(i, attr, None) for i in read_all(p)]))

        test = self.tmp_path('test.tsv')
        copy(self.fixture_path('conceptlist.tsv'), test)
        self.assertEqual(nattr(test, 'CONCEPTICON_GLOSS'), 0)
        link(Mock(args=[test], data=None))
        self.assertEqual(nattr(test, 'CONCEPTICON_GLOSS'), 1)

        copy(self.fixture_path('conceptlist2.tsv'), test)
        with capture(link, Mock(args=[test], data=None)) as out:
            self.assertIn('unknown CONCEPTICON_GLOSS', out)
            self.assertIn('mismatch', out)
Code example #18
def test_link(mocker, fixturedir, tmpdir, capsys):
    from pyconcepticon.commands import link

    with pytest.raises(ParserError):
        link(mocker.Mock(args=['.'], repos=None))

    def nattr(p, attr):
        return len(nfilter([getattr(i, attr, None) for i in read_all(str(p))]))

    test = tmpdir.join('test.tsv')
    copy(fixturedir.joinpath('conceptlist.tsv'), str(test))
    assert nattr(test, 'CONCEPTICON_GLOSS') == 0
    link(mocker.Mock(args=[str(test)], repos=None))
    assert nattr(test, 'CONCEPTICON_GLOSS') == 1

    copy(fixturedir.joinpath('conceptlist2.tsv'), str(test))
    link(mocker.Mock(args=[str(test)], repos=None))
    out, err = capsys.readouterr()
    assert 'unknown CONCEPTICON_GLOSS' in out
    assert 'mismatch' in out
Code example #19
def test_ISO(tmppath):
    from clldutils.iso_639_3 import ISO, Code

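    # The YYYYMMDD date embedded in the zip file name is picked up as the
    # date of the code tables, as asserted below.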
    dated_zip = tmppath / '20121201.zip'
    copy(FIXTURES.joinpath('iso.zip'), dated_zip)
    iso = ISO(dated_zip)
    assert '{0}'.format(iso) == 'ISO 639-3 code tables from 2012-12-01'

    iso = ISO(FIXTURES.joinpath('iso.zip'))
    assert '{0}'.format(iso) == 'ISO 639-3 code tables from 2016-07-25'
    for attr in Code._type_map.values():
        assert isinstance(getattr(iso, attr.lower()), list)

    assert len(iso.languages) == 7
    assert len(iso.macrolanguages[0].extension) == 2
    assert len(iso.languages[0].extension) == 0
    assert len(iso.retirements[0].change_to) == 1
    assert iso['auv'].change_to[0] in iso.languages
    d = {iso['auv']: 1}
    assert iso['auv'] in d
    assert '[twi]' in repr(sorted(iso.values(), reverse=True)[0])
    assert '%s' % iso['aab'] == 'Alumu-Tesu [aab]'
Code example #20
    def retrieve(self, item, cdstar_catalog, checksums, mediacatalog):
        """
        - download
        - compute checksum
        - upload to CDSTAR
        - add to cdstar.json

        :return: Image instance
        """
        md = self.metadata(item) or {}
        source_url = md.pop('source_url', None)
        if not source_url:
            return
        # We turn the Staged_images instance into a `dict`, which we will enrich and then
        # turn into an Images instance.
        item = dict(zip(item.fields(), item.csv_row()))
        with TemporaryDirectory() as tmp:
            if isinstance(source_url, Path):
                fname = tmp.joinpath(source_url.name)
                copy(source_url, fname)
            else:
                # download the thing
                fname = self._download(source_url, tmp)
                if not fname:
                    return
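            # The MD5 checksum doubles as the image's unique id and as the
            # key used for duplicate detection in the media catalog.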
            checksum = md5(fname)
            if checksum in checksums:
                raise ValueError('duplicate item {0} {1}'.format(item['id'], checksum))
            item.update(md)
            item['id'] = checksum
            item['collection'] = 'Tsammalex'
            img = Images.fromdict(item)
            if checksum not in mediacatalog.items:
                # now upload to CDSTAR
                _, _, obj = list(cdstar_catalog.create(fname, item))[0]
                mediacatalog.add(obj)
            return img
Code example #21
def test_all(capsys, tmpdir, mocker, data):
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")

        md = str(tmpdir / 'md.json')
        copy(str(data / 'ds1.csv-metadata.json'), md)
        copy(str(data / 'ds1.bib'), str(tmpdir / 'ds1.bib'))
        copy(str(data / 'ds1.csv'), str(tmpdir / 'ds1.csv'))
        pdata = str(tmpdir / 'values.csv')
        copy(str(data / 'ds1.csv'), pdata)

        main(['validate', md])
        out, err = capsys.readouterr()
        assert not out

        main(['stats', pdata])
        out, err = capsys.readouterr()
        assert 'StructureDataset' in out

        main(['stats', md])

        with pytest.raises(SystemExit):
            main(['createdb', md])

        log = mocker.MagicMock()
        main(['createdb', md, str(tmpdir / 'test.sqlite')], log=log)
        assert log.info.called
        main(['dumpdb', md, str(tmpdir / 'test.sqlite')], log=log)

        uc = [
            w_ for w_ in w if issubclass(w_.category, UserWarning)
            and str(w_.message).startswith('Unspecified column')
        ]
        assert uc

    with pytest.raises(SystemExit):
        main(['createdb', md, str(tmpdir / 'test.sqlite')], log=log)
Code example #22
File: test_cli.py Project: afcarl/pycldf
def test_all(capsys, tmpdir, mocker, data):
    md = str(tmpdir / 'md.json')
    copy(str(data / 'ds1.csv-metadata.json'), md)
    copy(str(data / 'ds1.bib'), str(tmpdir / 'ds1.bib'))
    copy(str(data / 'ds1.csv'), str(tmpdir / 'ds1.csv'))
    pdata = str(tmpdir / 'values.csv')
    copy(str(data / 'ds1.csv'), pdata)

    validate(mocker.MagicMock(args=[md]))
    out, err = capsys.readouterr()
    assert not out

    stats(mocker.MagicMock(args=[pdata]))
    out, err = capsys.readouterr()
    assert 'StructureDataset' in out

    stats(mocker.MagicMock(args=[md]))

    with pytest.raises(ParserError):
        createdb(mocker.MagicMock(args=[md]))

    log = mocker.MagicMock()
    createdb(mocker.MagicMock(log=log, args=[md, str(tmpdir / 'test.sqlite')]))
    assert log.info.called
Code example #23
File: test_iso_639_3.py Project: clld/clldutils
 def urlretrieve(url, dest):
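     # Test stub, presumably monkeypatched over the real network call: it
     # serves the bundled fixture archive instead of downloading anything.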
     copy(FIXTURES.joinpath('iso.zip'), dest)
Code example #24
File: conftest.py Project: basagashka/pylexibank
def repos(tmpd):
    repos = tmpd / 'lexibank-data'
    copytree(Path(__file__).parent.joinpath('repos'), repos)
    copy(
        Path(pylexibank.__file__).parent.joinpath('cldf-metadata.json'), repos)
    yield repos