Example #1
    def test_write_read_archive(self):
        from pycldf.dataset import Dataset
        from pycldf.util import Archive

        ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
        out = self.tmp_path()

        with self.assertRaises(ValueError):
            ds.write(out.joinpath('non-existing'), '.tsv', archive=True)

        with Archive(self.tmp_path('archive.zip').as_posix(), 'w') as archive:
            ds.write('.', archive=archive)
            ds2 = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
            ds2.name = 'new_name'
            ds2.write('.', archive=archive)
        ds_out = Dataset.from_zip(self.tmp_path('archive.zip'), name='ds1')
        self.assertEqual(ds.rows, ds_out.rows)
        self.assertEqual(ds.metadata, ds_out.metadata)

        with Archive(self.tmp_path('archive.zip')) as archive:
            ds_out = Dataset.from_metadata('ds1.csv-metadata.json', container=archive)
            self.assertEqual(ds.rows, ds_out.rows)
            self.assertEqual(ds.metadata, ds_out.metadata)

        ds.write(out, '.tsv', archive=True)
        ds_out = Dataset.from_zip(out.joinpath('ds1.zip'))
        self.assertEqual(ds.rows, ds_out.rows)
        self.assertEqual(ds.metadata, ds_out.metadata)
Example #2
def stats(args):
    """
    cldf stats <DATASET>

    Print basic stats for CLDF dataset <DATASET>, where <DATASET> may be the path to
    - a CLDF metadata file
    - a CLDF core data file
    - a CLDF zip archive
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.zip':
        ds = Dataset.from_zip(fname)
    elif fname.name.endswith(MD_SUFFIX):
        ds = Dataset.from_metadata(fname)
    else:
        ds = Dataset.from_file(fname)
    print(fname)
    stats_ = ds.stats
    print("""
Name: %s
Different languages: %s
Different parameters: %s
Rows: %s
""" % (
        ds.name,
        len(stats_['languages']),
        len(stats_['parameters']),
        stats_['rowcount']
    ))
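A minimal sketch of calling `stats` programmatically; in the real `cldf` command line an argparse-style namespace is passed in, so the `SimpleNamespace` and the fixture path below are stand-ins only:

from types import SimpleNamespace

# Hypothetical invocation: any CLDF metadata file, core data file or zip
# archive should work as the single positional argument.
stats(SimpleNamespace(args=['fixtures/ds1.csv-metadata.json']))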
Example #3
def get_dataset(fname):
    """Load a CLDF dataset.

    Load the file as a `json` CLDF metadata description file, or as a
    metadata-free dataset contained in a single csv file.

    The distinction is made based on the file extension: `.json` files are
    loaded as metadata descriptions, all other files are matched against the
    CLDF module specifications. Directories are checked for the presence of
    any CLDF dataset, in undefined order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    Dataset
    """
    fname = Path(fname)
    if not fname.exists():
        raise FileNotFoundError(
            '{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
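A short usage sketch with hypothetical file paths; the file extension decides which constructor is used:

# A .json path is treated as a CLDF metadata description ...
ds = get_dataset('wordlist/Wordlist-metadata.json')  # hypothetical path

# ... any other file is passed to Dataset.from_data, which infers the CLDF
# module from the file name (e.g. values.csv -> StructureDataset).
ds = get_dataset('structuredataset/values.csv')  # hypothetical path
print(ds.module)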
Example #4
def make_dataset(name='test', fields=None, rows=None):
    ds = Dataset(name)
    ds.fields = fields or tuple(f[0] for f in REQUIRED_FIELDS)
    if rows:
        for row in rows:
            ds.add_row(row)
    return ds
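For illustration, a hedged call to this test helper, using a row shaped like the one in Example #21 below; the values are placeholders:

ds = make_dataset(rows=[['1', 'abcd1234', 'fid', 'yes']])
assert len(ds) == 1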
Example #5
    def test_dataset_from_file(self):
        from pycldf.dataset import Dataset

        ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
        self.assertIn('ds1', repr(ds))
        self.assertEqual(len(ds), 2)
        self.assertEqual(ds.table.url, 'ds1.csv')
        self.assertEqual(ds.metadata['dc:creator'], 'The Author')

        row = ['3', 'abcd1234', 'fid2', 'maybe', '', 'new[4]']
        with self.assertRaises(ValueError):
            ds.add_row(row)

        ds.sources.add('@book{new,\nauthor={new author}}')
        res = ds.add_row(row)
        self.assertEqual(res.url, 'http://example.org/valuesets/3')
        self.assertEqual(len(res.refs), 1)
        self.assertEqual(
            res.valueUrl('Language_ID'),
            'http://glottolog.org/resource/languoid/id/abcd1234')
        res = ds.add_row(['4', None, None, None, None, None])
        self.assertEqual(res.valueUrl('Language_ID'), None)
        out = self.tmp_path()
        ds.write(out, '.tsv')
        self.assertTrue(out.joinpath('ds1.bib').exists())
        md = load(out.joinpath('ds1.tsv-metadata.json'))
        self.assertEqual('ds1.tsv', md['tables'][0]['url'])
        Dataset.from_file(out.joinpath('ds1.tsv'))
Example #6
    def test_dataset_from_metadata(self):
        from pycldf.dataset import Dataset

        ds = Dataset.from_metadata(FIXTURES.joinpath('ds1.csv-metadata.json'))
        self.assertIn('ds1', repr(ds))

        with self.assertRaises(ValueError):
            Dataset.from_metadata(FIXTURES.joinpath('ds1.csv-me.json'))
Example #7
def test_duplicate_component(ds, tmpdir):
    # adding a component twice is not possible:
    t = ds.add_component('ValueTable')
    t.url = Link('other.csv')
    with pytest.raises(ValueError):
        ds.add_component('ValueTable')

    # JSON descriptions with duplicate components cannot be read:
    md = tmpdir / 'md.json'
    json = """\
{
    "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#StructureDataset",
    "tables": [
        {"url": "values.csv"},
        COMPS 
    ]
}"""
    comp = """
{
    "url": "values.csv",
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable",
    "tableSchema": {
        "columns": [
            {
                "name": "ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id"
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference"
            },
            {
                "name": "Parameter_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference"
            },
            {
                "name": "Value",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#value"
            }
        ]
    }
}"""

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")

        md.write_text(json.replace('COMPS', comp), encoding='utf8')
        (tmpdir / 'values.csv').write_text(
            "ID,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf8')
        ds = Dataset.from_metadata(str(md))
        assert ds.validate()

        md.write_text(json.replace('COMPS', ', '.join([comp, comp])),
                      encoding='utf8')
        with pytest.raises(ValueError) as excinfo:
            Dataset.from_metadata(str(md))
        assert 'duplicate component' in excinfo.exconly()
Example #8
def _get_dataset(args):
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
Example #9
def _get_dataset(args):
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
Example #10
    def test_invalid_dataset_from_file(self):
        from pycldf.dataset import Dataset

        log = Mock(warn=Mock())
        with patch('pycldf.dataset.log', log):
            Dataset.from_file(FIXTURES.joinpath('invalid.csv'), skip_on_error=True)
            self.assertEqual(log.warn.call_count, 2)

        with self.assertRaises(ValueError):
            Dataset.from_file(FIXTURES.joinpath('invalid.csv'))
Example #11
def test_modules(tmpdir):
    ds = Dataset(_make_tg(tmpdir))
    assert ds.primary_table is None
    ds = Dataset(_make_tg(tmpdir, {"url": "data.csv"}))
    assert ds.primary_table is None
    ds = Dataset(_make_tg(tmpdir, {
        "url": "data.csv",
        "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable"}))
    assert ds.primary_table == 'ValueTable'
    assert Wordlist.in_dir(str(tmpdir)).primary_table
    assert Dictionary.in_dir(str(tmpdir)).primary_table
    assert StructureDataset.in_dir(str(tmpdir)).primary_table
Example #12
def test_duplicate_component(ds, tmpdir):
    # adding a component twice is not possible:
    ds.add_component('ValueTable')
    with pytest.raises(ValueError):
        ds.add_component('ValueTable')

    # JSON descriptions with duplicate components cannot be read:
    md = tmpdir / 'md.json'
    json = """\
{
    "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#StructureDataset",
    "tables": [
        {"url": "values.csv"},
        COMPS 
    ]
}"""
    comp = """
{
    "url": "values.csv",
    "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable",
    "tableSchema": {
        "columns": [
            {
                "name": "ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#id"
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference"
            },
            {
                "name": "Parameter_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference"
            },
            {
                "name": "Value",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#value"
            }
        ]
    }
}"""
    md.write_text(json.replace('COMPS', comp), encoding='utf8')
    (tmpdir / 'values.csv').write_text(
        "ID,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf8')
    ds = Dataset.from_metadata(str(md))
    assert ds.validate()

    md.write_text(json.replace('COMPS', ', '.join([comp, comp])), encoding='utf8')
    with pytest.raises(ValueError) as excinfo:
        Dataset.from_metadata(str(md))
    assert 'duplicate component' in excinfo.exconly()
Example #13
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'),
               "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1
Example #14
    def test_validate(self):
        from pycldf.dataset import Dataset, REQUIRED_FIELDS

        ds = Dataset('name')
        with self.assertRaises(AssertionError):  # missing required fields!
            ds.fields = ('a',)

        with self.assertRaises(AssertionError):  # fields must be tuple
            ds.fields = [variants[-1] for variants in REQUIRED_FIELDS]

        ds.fields = tuple(variants[-1] for variants in REQUIRED_FIELDS)

        with self.assertRaises(ValueError):  # fields cannot be reassigned!
            ds.fields = tuple(variants[0] for variants in REQUIRED_FIELDS)
Example #15
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Example #16
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    Path(str(tmpdir / 'values.csv')).write_text(
        "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf-8')
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))
        assert ds.module == 'StructureDataset'

        assert len(list(ds['ValueTable'])) == 2
        ds.validate()
        ds['ValueTable'].write(2 * list(ds['ValueTable']))
        with pytest.raises(ValueError):
            ds.validate()
        md = ds.write_metadata()
        Dataset.from_metadata(md)
        repr(ds)
        del ds.tablegroup.common_props['dc:conformsTo']
        Dataset.from_metadata(ds.write_metadata())
        assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Example #17
    def __attrs_post_init__(self):
        if self.default_metadata_path:
            self.default_metadata_path = pathlib.Path(
                self.default_metadata_path)
            try:
                Dataset.from_metadata(self.default_metadata_path)
            except Exception:
                raise ValueError('invalid default metadata: {0}'.format(
                    self.default_metadata_path))
        else:
            self.default_metadata_path = pkg_path(
                'modules', '{0}{1}'.format(self.module, MD_SUFFIX))

        if not self.metadata_fname:
            self.metadata_fname = self.default_metadata_path.name
Example #18
def test_db_write(tmpdir, data):
    ds = Dataset.from_metadata(data / 'ds1.csv-metadata.json')
    db = Database(ds, fname=str(tmpdir.join('db.sqlite')))
    db.write_from_tg()
    #shutil.copy(str(tmpdir.join('db.sqlite')), 'db.sqlite')
    assert len(
        db.query(
            "select * from ValueTable where cldf_parameterReference = 'fid1'")
    ) == 1
    assert len(db.query('select * from SourceTable')) == 3
    assert len(
        db.query(
            "select valuetable_cldf_id from ValueTable_SourceTable where context = '2-5'"
        )) == 1

    assert db.read()['ValueTable'][0]['cldf_source'] == [
        '80086', 'meier2015[2-5]'
    ]
    db.to_cldf(str(tmpdir.join('cldf')))
    assert tmpdir.join('cldf', 'ds1.bib').check()
    assert '80086;meier2015[2-5]' in tmpdir.join('cldf',
                                                 'ds1.csv').read_text('utf8')

    with pytest.raises(ValueError):
        db.write_from_tg()

    with pytest.raises(NotImplementedError):
        db.write_from_tg(_exists_ok=True)

    db.write_from_tg(_force=True)
Example #19
def test_db(data, db):
    ds = Dataset.from_metadata(str(data / 'ds1.csv-metadata.json'))
    db.create()
    db.load(ds)
    assert len(db.fetchall("SELECT name FROM dataset")) == 1
    with pytest.raises(IntegrityError):
        db.load(ds)
    db.delete(db.fetchone("SELECT ID FROM dataset")[0])
    db.load(ds)
    db.drop()
Example #20
    def add_row(self, row):
        #
        # add segments column, value cleaned from "<>=..."
        #
        row = CldfDatasetBase.add_row(self, row)
        if row:
            for col, validator in self.validators.items():
                if not validator(row):
                    del self._rows[row['ID']]
                    return
        return row
Example #21
    def test_write_read(self):
        from pycldf.dataset import Dataset, REQUIRED_FIELDS

        row = ['1', 'abcd1234', 'fid', 'yes']
        ds = Dataset('name')
        ds.fields = tuple(v[0] for v in REQUIRED_FIELDS)
        ds.add_row(row)
        ds.write(self.tmp_path())
        self.assertTrue(self.tmp_path('name.csv').exists())
        ds2 = Dataset.from_file(self.tmp_path('name.csv'))
        self.assertEqual(list(ds2[0].values()), row)
        self.assertEqual(list(ds2['1'].values()), row)
Example #22
def test_db_write(tmpdir, data):
    #import shutil
    ds = Dataset.from_metadata(data / 'ds1.csv-metadata.json')
    db = Database(ds, fname=str(tmpdir.join('db.sqlite')))
    db.write_from_tg()
    #shutil.copy(str(tmpdir.join('db.sqlite')), 'db.sqlite')
    assert len(db.query("select * from ValueTable where cldf_parameterReference = 'fid1'")) == 1
    assert len(db.query('select * from SourceTable')) == 2
    assert len(db.query("select * from ValueTable_SourceTable where context = '2-5'")) == 1

    assert db.read()['ValueTable'][0]['cldf_source'] == ['80086', 'meier2015[2-5]']
    db.to_cldf(str(tmpdir.join('cldf')))
    assert tmpdir.join('cldf', 'ds1.bib').check()
    assert '80086;meier2015[2-5]' in tmpdir.join('cldf', 'ds1.csv').read_text('utf8')
Example #23
    def test_CldfDownload(self):
        from clld.web.adapters.cldf import CldfDownload

        tmp = self.tmp_path('dl.zip')
        dl = CldfDownload(Dataset, 'clld')
        dl.create(self.env['request'], verbose=False, outfile=tmp)
        ds = CldfDataset.from_zip(tmp)
        self.assertEqual(ds.name, 'dataset-contribution-contribution')
        self.assertEqual(
            'http://localhost/values/{ID}',
            ds.table.schema.aboutUrl)
        self.assertEqual(
            'http://localhost/languages/{Language_ID}',
            ds.table.schema.columns['Language_ID'].valueUrl)
        self.assertEqual(len(ds.rows), 3)
        self.assertIn('Language_glottocode', ds[0])
        self.assertIn('10-20', ds['value2']['Source'])
Example #24
def test_validators(tmpdir, mocker, data):
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        ds.validate()

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 2

    for col in ds.tablegroup.tables[0].tableSchema.columns:
        if col.name == 'Language_ID':
            col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 4
Example #25
def test_validators(tmpdir, mocker, data):
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        ds.validate()

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 2

    for col in ds.tablegroup.tables[0].tableSchema.columns:
        if col.name == 'Language_ID':
            col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    log = mocker.Mock()
    ds.validate(log=log)
    assert log.warn.call_count == 4
Example #26
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources,
                           values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid),
                                contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(
                            CognatesetCounterpart(
                                cognateset=cs,
                                counterpart=cp,
                                cognate_detection_method=cognate[
                                    'Cognate_detection_method'],
                                alignment=cognate['Alignment'],
                                alignment_method=cognate['Alignment_method'],
                                doubt=cognate['Doubt'] == 'True'))
Example #27
def test_CldfDownload(env, tmppath, mocker, capsys):
    from clld.web.adapters.cldf import CldfDownload

    mocker.patch('clld.web.adapters.cldf.transaction')
    tmp = tmppath / 'dl.zip'
    dl = CldfDownload(Dataset, 'clld')
    dl.create(env['request'], outfile=tmp, verbose=True)
    out, err = capsys.readouterr()
    assert 'Value' in out

    outdir = tmppath / 'cldf'
    with ZipFile(tmp.as_posix()) as zip:
        assert 'Wordlist-metadata.json' in zip.namelist()
        zip.extractall(str(outdir))

    ds = CldfDataset.from_metadata(outdir.joinpath('Wordlist-metadata.json'))
    assert ds.module == 'Wordlist'
    values = list(ds[ds.primary_table])
    assert len(values) == 3
    for v in values:
        list(ds.sources.expand_refs(v['Source']))
Example #28
def test_CldfDownload(env, tmppath, mocker, capsys):
    from clld.web.adapters.cldf import CldfDownload

    mocker.patch('clld.web.adapters.cldf.transaction')
    tmp = tmppath / 'dl.zip'
    dl = CldfDownload(Dataset, 'clld')
    dl.create(env['request'], outfile=tmp, verbose=True)
    out, err = capsys.readouterr()
    assert 'Value' in out

    outdir = tmppath / 'cldf'
    with ZipFile(tmp.as_posix()) as zip:
        assert 'Wordlist-metadata.json' in zip.namelist()
        zip.extractall(str(outdir))

    ds = CldfDataset.from_metadata(outdir.joinpath('Wordlist-metadata.json'))
    assert ds.module == 'Wordlist'
    values = list(ds[ds.primary_table])
    assert len(values) == 3
    for v in values:
        list(ds.sources.expand_refs(v['Source']))
Example #29
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
Example #30
def areality(metadata, feature_id):
    # Read the CLDF Dataset specified by a metadata file:
    grambank = Dataset.from_metadata(metadata)
    for feature in grambank['ParameterTable']:
        if feature['ID'] == feature_id:
            break
    else:
        raise ValueError('unknown Grambank feature ID: {0}'.format(feature_id))

    # Information about macroareas is stored in the table with language metadata:
    area_map = {l['ID']: l['Macroarea'] for l in grambank['LanguageTable']}

    # We want to map numeric feature values to more descriptive ones, so we
    # have to read the value descriptions from the code table:
    codes = {c['ID']: c['Description'] for c in grambank['CodeTable']}

    res = Counter()
    for value in grambank['ValueTable']:
        if value['Parameter_ID'] == feature_id:
            # We count the occurrences of each value by (area, code):
            res.update([(
                area_map[value['Language_ID']], 
                codes[value['Code_ID']] if value['Code_ID'] else 'Not known')])
    return feature, res
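A hedged usage sketch; the metadata path and the feature ID are placeholders, and a `Name` column is assumed to exist in the ParameterTable:

feature, counts = areality('grambank/cldf/StructureDataset-metadata.json', 'GB020')
print(feature['Name'])
for (area, code), n in counts.most_common():
    print(area, code, n, sep='\t')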
Example #31
def itercldf(dataset, id_):
    archive = Archive(dataset.raw.joinpath('{0}.zip'.format(id_)))
    for name in archive.namelist():
        if name.endswith(MD_SUFFIX):
            yield Dataset.from_metadata(Path(name), archive)
Example #32
def read_cldf_data(config):
    """
    Read CLDF data as lists of Python dictionaries.

    This function interfaces with `pycldf`. The tables and columns to
    extract are obtained from `*_fields` entries in `config`.

    Parameters
    ----------
    config : dict
        A dictionary with the configurations.
    """

    # Read dataset from metadata
    metadata = config["base_path"] / "demo_cldf" / "cldf-metadata.json"
    dataset = Dataset.from_metadata(metadata.as_posix())

    # Transform the dataset into a Python data structure (`cldf_data`) suitable
    # for Jinja template manipulation. `cldf_data` is a dictionary of tables,
    # keyed by table name; each value is a dictionary with `columns` (the list
    # of column names found in the rows) and `rows`. `rows` is a list of
    # dictionaries with the `value` to be reported and, optionally, other
    # information (such as the `url`) which may or may not be used by the
    # template (`value` is always used).
    # TODO: make conversion less explicit and with fewer loops
    # table.base -> /home/tresoldi/src/staticcldf/demo_cldf
    # table.url -> cognates.csv
    # table.local_name -> cognates.csv
    # for col in table.tableSchema.columns:
    #   - col.datatype.base -> string, decimal
    #   - col.header -> Alignment_Source
    #   - col.name -> Alignment_Source
    #   - col.propertyUrl -> None, http://cldf.clld.org/v1.0/terms.rdf#alignment
    #   - col.valueUrl -> None, http://glottolog.org/resource/languoid/id/{glottolog_id}
    cldf_data = {}
    for table in dataset.tables:
        table_key = table.local_name.split(".")[0]

        column_names = [col.name for col in table.tableSchema.columns]
        valueUrls = [col.valueUrl for col in table.tableSchema.columns]
        datatypes = [col.datatype.base for col in table.tableSchema.columns]

        # Holder for the table values in the returned structure
        table_data = []

        # Iterate over all rows for the current table
        for row in table:
            # Holder for the row in the returned structure
            row_data = []

            # Iterate over all columns for the current row
            for column, valueUrl in zip(column_names, valueUrls):
                if not row[column]:
                    value = ""
                elif isinstance(row[column], (list, tuple)):
                    value = " ".join([str(value) for value in row[column]])
                else:
                    value = str(row[column])

                if valueUrl:
                    # Ugly replacement, but works with CLDF metadata
                    # (assuming there is a single replacement)
                    var_name = list(valueUrl.variable_names)[0]
                    url = valueUrl.expand(**{var_name: value})
                else:
                    url = None

                # Append computed values to `row_data`
                row_data.append({"value": value, "url": url})

            # Append current row to the table
            table_data.append(row_data)

        #  Append contents to overall table
        column_data = [
            {"name": name, "datatype": datatype}
            for name, datatype in zip(column_names, datatypes)
        ]
        cldf_data[table_key] = {"columns": column_data, "rows": table_data}

    # TODO: remove those which are all empty or None

    return cldf_data
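A minimal sketch of driving this function, assuming only what the code above requires: a `config` dict whose `base_path` contains `demo_cldf/cldf-metadata.json`:

from pathlib import Path

config = {"base_path": Path(".")}  # hypothetical project root
cldf_data = read_cldf_data(config)
for table_name, table in cldf_data.items():
    print(table_name, [col["name"] for col in table["columns"]], len(table["rows"]))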
Example #33
def test_Dataset_from_data_empty_file(tmpdir):
    write_text(str(tmpdir / 'values.csv'), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))
Example #34
def test_Dataset_from_data_empty_file(tmpdir):
    Path(str(tmpdir / 'values.csv')).write_text('', encoding='utf-8')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))
Example #35
    def dataset(self, req):
        ds = Dataset('%s-%s-%s' % (
            req.dataset.id, self.obj.__class__.__name__.lower(), self.obj.id))
        cols = self.columns(req)
        ds.fields = tuple(col['name'] if isinstance(col, dict) else col for col in cols)
        ds.table.schema.aboutUrl = url_template(req, 'value', 'ID')

        for col in cols:
            if isinstance(col, dict):
                name = col.pop('name')
                for attr, value in col.items():
                    setattr(ds.table.schema.columns[name], attr, value)

        ds.metadata['dc:bibliographicCitation'] = text_citation(req, self.obj)
        ds.metadata['dc:publisher'] = '%s, %s' % (
            req.dataset.publisher_name, req.dataset.publisher_place)
        ds.metadata['dc:license'] = req.dataset.license
        ds.metadata['dc:issued'] = req.dataset.published.isoformat()
        ds.metadata['dc:title'] = self.obj.name
        ds.metadata['dc:creator'] = self.obj.formatted_contributors()
        ds.metadata['dc:identifier'] = req.resource_url(self.obj)
        ds.metadata['dc:isPartOf'] = req.resource_url(req.dataset)
        ds.metadata['dcat:accessURL'] = req.route_url('download')

        for value in self.value_query():
            refs, sources = self.refs_and_sources(req, value)
            row = self.row(req, value, refs)
            if row:
                ds.sources.add(*sources)
                ds.add_row(row)
        return ds
Example #36
def test_Dataset_from_data_empty_file(tmpdir):
    write_text(str(tmpdir / 'values.csv'), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))