Esempio n. 1
0
def test_Dataset_from_scratch(tmpdir, data):
    """Exercise ``Dataset.from_data``: rejection of unknown file names and
    non-standard columns, CLDF module detection, validation, metadata
    round-tripping and adding an extra table."""
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        # The call is expected to raise, so its result is deliberately not bound.
        Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    # Writing each row twice creates duplicate IDs, so validation must fail:
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    # Even without the conformance declaration the metadata must load:
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Esempio n. 2
0
def test_Dataset_from_scratch(tmpdir, data):
    """Exercise ``Dataset.from_data``: rejection of unknown file names and
    non-standard columns, CLDF module detection, validation, metadata
    round-tripping and adding an extra table."""
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    Path(str(tmpdir / 'values.csv')).write_text(
        "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1", encoding='utf-8')
    with pytest.raises(ValueError, match='missing columns'):
        # The call is expected to raise, so its result is deliberately not bound.
        Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    # Collect (and ignore) warnings so they don't leak into other tests:
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))
        assert ds.module == 'StructureDataset'

        assert len(list(ds['ValueTable'])) == 2
        ds.validate()
        # Writing each row twice creates duplicate IDs, so validation must fail:
        ds['ValueTable'].write(2 * list(ds['ValueTable']))
        with pytest.raises(ValueError):
            ds.validate()
        md = ds.write_metadata()
        Dataset.from_metadata(md)
        repr(ds)
        # Even without the conformance declaration the metadata must load:
        del ds.tablegroup.common_props['dc:conformsTo']
        Dataset.from_metadata(ds.write_metadata())
        assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Esempio n. 3
0
def test_Dataset_from_scratch(tmpdir, data):
    """Exercise ``Dataset.from_data``: rejection of unknown file names and
    non-standard columns, CLDF module detection, validation and metadata
    round-tripping."""
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'),
               "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        # The call is expected to raise, so its result is deliberately not bound.
        Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    # Writing each row twice creates duplicate IDs, so validation must fail:
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    # Even without the conformance declaration the metadata must load:
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1
Esempio n. 4
0
def get_dataset(fname):
    """Load a CLDF dataset.

    Load the file as `json` CLDF metadata description file, or as metadata-free
    dataset contained in a single csv file.

    The distinction is made depending on the file extension: `.json` files are
    loaded as metadata descriptions, all other files are matched against the
    CLDF module specifications. Directories are checked for the presence of
    any CLDF datasets in undefined order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    Dataset
    """
    path = Path(fname)
    if not path.exists():
        raise FileNotFoundError(
            '{:} does not exist'.format(path))
    # Dispatch on the extension: metadata description vs. plain data file.
    loader = Dataset.from_metadata if path.suffix == '.json' else Dataset.from_data
    return loader(path)
Esempio n. 5
0
def _get_dataset(args):
    """Return the Dataset named by the first positional CLI argument.

    ``args.args[0]`` may be a ``.json`` CLDF metadata file or a data file
    of a known CLDF component.

    Raises ParserError if no argument was given or the path is not an
    existing file.
    """
    if not args.args:  # idiomatic emptiness check instead of len(...) < 1
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        # The check requires an existing *file*; the previous message
        # incorrectly said "directory".
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
Esempio n. 6
0
def _get_dataset(args):
    """Return the Dataset named by the first positional CLI argument.

    ``args.args[0]`` may be a ``.json`` CLDF metadata file or a data file
    of a known CLDF component.

    Raises ParserError if no argument was given or the path is not an
    existing file.
    """
    if not args.args:  # idiomatic emptiness check instead of len(...) < 1
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        # The check requires an existing *file*; the previous message
        # incorrectly said "directory".
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
Esempio n. 7
0
def test_validators(tmpdir, mocker, data):
    """``validate()`` must raise on invalid data, and report problems as
    warnings when given a ``log`` object."""
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    dataset = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        dataset.validate()

    mock_log = mocker.Mock()
    dataset.validate(log=mock_log)
    assert mock_log.warn.call_count == 2

    # Declaring Language_ID to be a glottocode makes its values invalid, too:
    for column in dataset.tablegroup.tables[0].tableSchema.columns:
        if column.name == 'Language_ID':
            column.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    mock_log = mocker.Mock()
    dataset.validate(log=mock_log)
    assert mock_log.warn.call_count == 4
Esempio n. 8
0
def test_validators(tmpdir, mocker, data):
    """``validate()`` must raise on invalid data, and report problems as
    warnings when given a ``log`` object."""
    copy(str(data / 'invalid.csv'), str(tmpdir / 'values.csv'))
    dataset = Dataset.from_data(str(tmpdir / 'values.csv'))

    with pytest.raises(ValueError):
        dataset.validate()

    mock_log = mocker.Mock()
    dataset.validate(log=mock_log)
    assert mock_log.warn.call_count == 2

    # Declaring Language_ID to be a glottocode makes its values invalid, too:
    schema_columns = dataset.tablegroup.tables[0].tableSchema.columns
    for column in schema_columns:
        if column.name == 'Language_ID':
            column.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode'

    mock_log = mocker.Mock()
    dataset.validate(log=mock_log)
    assert mock_log.warn.call_count == 4
Esempio n. 9
0
def test_Dataset_from_data_empty_file(tmpdir):
    """``from_data`` must reject a zero-byte data file."""
    target = str(tmpdir / 'values.csv')
    Path(target).write_text('', encoding='utf-8')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(target)
Esempio n. 10
0
def test_Dataset_from_data_empty_file(tmpdir):
    """``from_data`` must reject a zero-byte data file."""
    target = str(tmpdir / 'values.csv')
    write_text(target, '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(target)
Esempio n. 11
0
def test_Dataset_from_data_empty_file(tmpdir):
    """``from_data`` must reject a zero-byte data file."""
    path = tmpdir / 'values.csv'
    write_text(str(path), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(path))