Code example #1
File: test_dataset.py (Project: LinguList/pycldf)
    def test_dataset_from_file(self):
        from pycldf.dataset import Dataset

        ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
        self.assertIn('ds1', repr(ds))
        self.assertEqual(len(ds), 2)
        self.assertEqual(ds.table.url, 'ds1.csv')
        self.assertEqual(ds.metadata['dc:creator'], 'The Author')

        # the row cites source 'new', which is not yet registered with the dataset:
        row = ['3', 'abcd1234', 'fid2', 'maybe', '', 'new[4]']
        with self.assertRaises(ValueError):
            ds.add_row(row)

        # once the source has been added, the row can be added too:
        ds.sources.add('@book{new,\nauthor={new author}}')
        res = ds.add_row(row)
        self.assertEqual(res.url, 'http://example.org/valuesets/3')
        self.assertEqual(len(res.refs), 1)
        self.assertEqual(
            res.valueUrl('Language_ID'),
            'http://glottolog.org/resource/languoid/id/abcd1234')
        res = ds.add_row(['4', None, None, None, None, None])
        self.assertEqual(res.valueUrl('Language_ID'), None)
        out = self.tmp_path()
        ds.write(out, '.tsv')
        self.assertTrue(out.joinpath('ds1.bib').exists())
        md = load(out.joinpath('ds1.tsv-metadata.json'))
        self.assertEqual('ds1.tsv', md['tables'][0]['url'])
        Dataset.from_file(out.joinpath('ds1.tsv'))
Code example #2
File: cli.py (Project: LinguList/pycldf)
def stats(args):
    """
    cldf stats <DATASET>

    Print basic stats for CLDF dataset <DATASET>, where <DATASET> may be the path to
    - a CLDF metadata file
    - a CLDF core data file
    - a CLDF zip archive
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.zip':
        ds = Dataset.from_zip(fname)
    elif fname.name.endswith(MD_SUFFIX):
        ds = Dataset.from_metadata(fname)
    else:
        ds = Dataset.from_file(fname)
    print(fname)
    stats_ = ds.stats
    print("""
Name: %s
Different languages: %s
Different parameters: %s
Rows: %s
""" % (
        ds.name,
        len(stats_['languages']),
        len(stats_['parameters']),
        stats_['rowcount']
    ))
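
For reference, a minimal sketch of the equivalent calls through the Python API, assuming 'ds1.csv' is a CLDF core data file like the fixture used in the surrounding examples:

    from pycldf.dataset import Dataset

    ds = Dataset.from_file('ds1.csv')
    stats_ = ds.stats
    print(ds.name, len(stats_['languages']), len(stats_['parameters']), stats_['rowcount'])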
Code example #3
File: test_dataset.py (Project: LinguList/pycldf)
    def test_write_read_archive(self):
        from pycldf.dataset import Dataset
        from pycldf.util import Archive

        ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
        out = self.tmp_path()

        with self.assertRaises(ValueError):
            ds.write(out.joinpath('non-existing'), '.tsv', archive=True)

        with Archive(self.tmp_path('archive.zip').as_posix(), 'w') as archive:
            ds.write('.', archive=archive)
            ds2 = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
            ds2.name = 'new_name'
            ds2.write('.', archive=archive)
        ds_out = Dataset.from_zip(self.tmp_path('archive.zip'), name='ds1')
        self.assertEqual(ds.rows, ds_out.rows)
        self.assertEqual(ds.metadata, ds_out.metadata)

        with Archive(self.tmp_path('archive.zip')) as archive:
            ds_out = Dataset.from_metadata('ds1.csv-metadata.json', container=archive)
            self.assertEqual(ds.rows, ds_out.rows)
            self.assertEqual(ds.metadata, ds_out.metadata)

        ds.write(out, '.tsv', archive=True)
        ds_out = Dataset.from_zip(out.joinpath('ds1.zip'))
        self.assertEqual(ds.rows, ds_out.rows)
        self.assertEqual(ds.metadata, ds_out.metadata)
Code example #4
File: util.py (Project: LinguList/pycldf)
def make_dataset(name='test', fields=None, rows=None):
    ds = Dataset(name)
    ds.fields = fields or tuple(f[0] for f in REQUIRED_FIELDS)
    if rows:
        for row in rows:
            ds.add_row(row)
    return ds
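
A minimal usage sketch for this helper, assuming REQUIRED_FIELDS has the four-column shape (id, language, parameter, value) seen in the other examples on this page:

    ds = make_dataset(rows=[['1', 'abcd1234', 'fid1', 'yes']])
    assert ds.name == 'test'
    assert len(ds) == 1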
Code example #5
File: test_dataset.py (Project: LinguList/pycldf)
    def test_dataset_from_metadata(self):
        from pycldf.dataset import Dataset

        ds = Dataset.from_metadata(FIXTURES.joinpath('ds1.csv-metadata.json'))
        self.assertIn('ds1', repr(ds))

        with self.assertRaises(ValueError):
            Dataset.from_metadata(FIXTURES.joinpath('ds1.csv-me.json'))
Code example #6
File: __main__.py (Project: glottobank/pycldf)
def _get_dataset(args):
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
Code example #7
File: test_dataset.py (Project: LinguList/pycldf)
    def test_invalid_dataset_from_file(self):
        from pycldf.dataset import Dataset

        log = Mock(warn=Mock())
        with patch('pycldf.dataset.log', log):
            Dataset.from_file(FIXTURES.joinpath('invalid.csv'), skip_on_error=True)
            self.assertEqual(log.warn.call_count, 2)

        with self.assertRaises(ValueError):
            Dataset.from_file(FIXTURES.joinpath('invalid.csv'))
Code example #8
File: test_dataset.py (Project: glottobank/pycldf)
def test_duplicate_component(ds, tmpdir):
    # adding a component twice is not possible:
    ds.add_component('ValueTable')
    with pytest.raises(ValueError):
        ds.add_component('ValueTable')

    # JSON descriptions with duplicate components cannot be read:
    md = tmpdir / 'md.json'
    json = """\
{
    "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#StructureDataset",
    "tables": [
        {"url": "values.csv"},
        COMPS 
    ]
}"""
    comp = """
{
    "url": "values.csv",
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable",
    "tableSchema": {
        "columns": [
            {
                "name": "ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id"
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference"
            },
            {
                "name": "Parameter_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference"
            },
            {
                "name": "Value",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#value"
            }
        ]
    }
}"""
    md.write_text(json.replace('COMPS', comp), encoding='utf8')
    (tmpdir / 'values.csv').write_text(
        "ID,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf8')
    ds = Dataset.from_metadata(str(md))
    assert ds.validate()

    md.write_text(json.replace('COMPS', ', '.join([comp, comp])), encoding='utf8')
    with pytest.raises(ValueError) as excinfo:
        Dataset.from_metadata(str(md))
    assert 'duplicate component' in excinfo.exconly()
Code example #9
File: test_dataset.py (Project: glottobank/pycldf)
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Code example #10
File: test_dataset.py (Project: LinguList/pycldf)
    def test_validate(self):
        from pycldf.dataset import Dataset, REQUIRED_FIELDS

        ds = Dataset('name')
        with self.assertRaises(AssertionError):  # missing required fields!
            ds.fields = ('a',)

        with self.assertRaises(AssertionError):  # fields must be tuple
            ds.fields = [variants[-1] for variants in REQUIRED_FIELDS]

        ds.fields = tuple(variants[-1] for variants in REQUIRED_FIELDS)

        with self.assertRaises(ValueError):  # fields cannot be reassigned!
            ds.fields = tuple(variants[0] for variants in REQUIRED_FIELDS)
Code example #11
File: test_dataset.py (Project: LinguList/pycldf)
    def test_write_read(self):
        from pycldf.dataset import Dataset, REQUIRED_FIELDS

        row = ['1', 'abcd1234', 'fid', 'yes']
        ds = Dataset('name')
        ds.fields = tuple(v[0] for v in REQUIRED_FIELDS)
        ds.add_row(row)
        ds.write(self.tmp_path())
        self.assertTrue(self.tmp_path('name.csv').exists())
        ds2 = Dataset.from_file(self.tmp_path('name.csv'))
        self.assertEqual(list(ds2[0].values()), row)
        self.assertEqual(list(ds2['1'].values()), row)
Code example #12
File: test_db.py (Project: glottobank/pycldf)
def test_db_write(tmpdir, data):
    #import shutil
    ds = Dataset.from_metadata(data / 'ds1.csv-metadata.json')
    db = Database(ds, fname=str(tmpdir.join('db.sqlite')))
    db.write_from_tg()
    #shutil.copy(str(tmpdir.join('db.sqlite')), 'db.sqlite')
    assert len(db.query("select * from ValueTable where cldf_parameterReference = 'fid1'")) == 1
    assert len(db.query('select * from SourceTable')) == 2
    assert len(db.query("select * from ValueTable_SourceTable where context = '2-5'")) == 1

    assert db.read()['ValueTable'][0]['cldf_source'] == ['80086', 'meier2015[2-5]']
    db.to_cldf(str(tmpdir.join('cldf')))
    assert tmpdir.join('cldf', 'ds1.bib').check()
    assert '80086;meier2015[2-5]' in tmpdir.join('cldf', 'ds1.csv').read_text('utf8')
Code example #13
    def test_CldfDownload(self):
        from clld.web.adapters.cldf import CldfDownload

        tmp = self.tmp_path('dl.zip')
        dl = CldfDownload(Dataset, 'clld')
        dl.create(self.env['request'], verbose=False, outfile=tmp)
        ds = CldfDataset.from_zip(tmp)
        self.assertEqual(ds.name, 'dataset-contribution-contribution')
        self.assertEqual(
            'http://localhost/values/{ID}',
            ds.table.schema.aboutUrl)
        self.assertEqual(
            'http://localhost/languages/{Language_ID}',
            ds.table.schema.columns['Language_ID'].valueUrl)
        self.assertEqual(len(ds.rows), 3)
        self.assertIn('Language_glottocode', ds[0])
        self.assertIn('10-20', ds['value2']['Source'])
Code example #14
File: test_dataset.py (Project: glottobank/pycldf)
def test_validators(tmpdir, mocker, data):
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        ds.validate()

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 2

    # re-declaring Language_ID as a Glottocode adds two more validation warnings:
    for col in ds.tablegroup.tables[0].tableSchema.columns:
        if col.name == 'Language_ID':
            col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 4
Code example #15
File: test_web_adapters_cldf.py (Project: clld/clld)
def test_CldfDownload(env, tmppath, mocker, capsys):
    from clld.web.adapters.cldf import CldfDownload

    mocker.patch('clld.web.adapters.cldf.transaction')
    tmp = tmppath / 'dl.zip'
    dl = CldfDownload(Dataset, 'clld')
    dl.create(env['request'], outfile=tmp, verbose=True)
    out, err = capsys.readouterr()
    assert 'Value' in out

    outdir = tmppath / 'cldf'
    with ZipFile(tmp.as_posix()) as zip:
        assert 'Wordlist-metadata.json' in zip.namelist()
        zip.extractall(str(outdir))

    ds = CldfDataset.from_metadata(outdir.joinpath('Wordlist-metadata.json'))
    assert ds.module == 'Wordlist'
    values = list(ds[ds.primary_table])
    assert len(values) == 3
    for v in values:
        list(ds.sources.expand_refs(v['Source']))
Code example #16
File: util.py (Project: Anaphory/lexibank)
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
Code example #17
    def dataset(self, req):
        ds = Dataset('%s-%s-%s' % (
            req.dataset.id, self.obj.__class__.__name__.lower(), self.obj.id))
        cols = self.columns(req)
        ds.fields = tuple(col['name'] if isinstance(col, dict) else col for col in cols)
        ds.table.schema.aboutUrl = url_template(req, 'value', 'ID')

        for col in cols:
            if isinstance(col, dict):
                name = col.pop('name')
                for attr, value in col.items():
                    setattr(ds.table.schema.columns[name], attr, value)

        ds.metadata['dc:bibliographicCitation'] = text_citation(req, self.obj)
        ds.metadata['dc:publisher'] = '%s, %s' % (
            req.dataset.publisher_name, req.dataset.publisher_place)
        ds.metadata['dc:license'] = req.dataset.license
        ds.metadata['dc:issued'] = req.dataset.published.isoformat()
        ds.metadata['dc:title'] = self.obj.name
        ds.metadata['dc:creator'] = self.obj.formatted_contributors()
        ds.metadata['dc:identifier'] = req.resource_url(self.obj)
        ds.metadata['dc:isPartOf'] = req.resource_url(req.dataset)
        ds.metadata['dcat:accessURL'] = req.route_url('download')

        for value in self.value_query():
            refs, sources = self.refs_and_sources(req, value)
            row = self.row(req, value, refs)
            if row:
                ds.sources.add(*sources)
                ds.add_row(row)
        return ds
Code example #18
File: test_dataset.py (Project: glottobank/pycldf)
def test_Dataset_from_data_empty_file(tmpdir):
    write_text(str(tmpdir / 'values.csv'), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))