Example #1
def test_Sources(tmpdir):
    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    for entry in src:
        assert entry.genre == 'book'
        break
    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'
    with pytest.raises(ValueError):
        src.add(5)
    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n  author="a.b"\n}')
    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover
    with pytest.raises(ValueError):
        src.parse('a[x')
    with pytest.raises(ValueError):
        src.parse('[x]')
    with pytest.raises(ValueError):
        src.validate(['x'])

    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1
Example #2
    def to_cldf(self, dest, mdname='cldf-metadata.json'):
        """
        Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

        :param dest: Destination directory for the CLDF dataset.
        :param mdname: Name of the metadata file to be written.
        :return: Path of the metadata file.
        """
        dest = Path(dest)
        if not dest.exists():
            dest.mkdir()

        data = self.read()

        if data[self.source_table_name]:
            sources = Sources()
            for src in data[self.source_table_name]:
                sources.add(Source(
                    src['genre'],
                    src['id'],
                    **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
            sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

        for table_type, items in data.items():
            try:
                table = self.dataset[table_type]
                table.common_props['dc:extent'] = table.write(
                    [self.retranslate(table, item) for item in items],
                    base=dest)
            except KeyError:
                assert table_type == self.source_table_name, table_type
        return self.dataset.write_metadata(dest / mdname)
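
A minimal call sketch (hypothetical: `db` stands for an instance of the database wrapper class that defines this method and provides `read()`, `retranslate()` and `self.dataset`):

    md_path = db.to_cldf('cldf_out')  # writes the tables, sources.bib and the metadata file
    print(md_path)                    # .../cldf_out/cldf-metadata.json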
Example #3
    def test_Sources_with_None_values(self):
        from pycldf.sources import Sources, Source

        src = Sources()
        src.add(Source('book', 'huber2005', title=None))
        bib = self.tmp_path('test.bib')
        src.write(bib.name, bib.parent)
Example #4
    def test_Sources(self):
        from pycldf.sources import Sources, Source

        src = Sources()
        src.add(BIB, Source(
            'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
        self.assertEqual(len(list(src.items())), 3)
        self.assertEqual(len(list(src.keys())), 3)
        refs = 'huber2005[1-6];Obrazy;Elegie[34]'
        self.assertEqual(src.format_refs(*list(src.expand_refs(refs))), refs)
        self.assertEqual('%s' % src['huber2005'], 'Huber, Herrmann. 2005. y.')
        with self.assertRaises(ValueError):
            src.add(5)
        with self.assertRaises(ValueError):
            src.add('@misc{a.b,\n  author="a.b"\n}')

        bib = self.tmp_path('test.bib')
        src.write(bib.name, bib.parent)

        src2 = Sources()
        src2.read(bib.name, bib.parent)

        bib = self.tmp_path('test.bib')
        src2.write(bib.name, bib.parent, ids=['huber2005'])
        src = Sources()
        src.read(bib.name, bib.parent)
        self.assertEqual(len(src), 1)
Example #5
    def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4):
        """
        Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

        :param dest: Destination directory for the CLDF dataset.
        :param mdname: Name of the metadata file to be written.
        :param coordinate_precision: Number of decimal places to which geo-coordinates are rounded.
        :return: Path of the metadata file.
        """
        dest = pathlib.Path(dest)
        if not dest.exists():
            dest.mkdir()

        data = self.read()

        if data[self.source_table_name]:
            sources = Sources()
            for src in data[self.source_table_name]:
                sources.add(Source(
                    src['genre'],
                    src['id'],
                    **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
            sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

        for table_type, items in data.items():
            try:
                table = self.dataset[table_type]
                items = [
                    self.round_geocoordinates(item, precision=coordinate_precision)
                    for item in items]
                table.common_props['dc:extent'] = table.write(
                    [self.retranslate(table, item) for item in items],
                    base=dest)
            except KeyError:
                assert table_type == self.source_table_name, table_type
        return self.dataset.write_metadata(dest / mdname)
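
The same call sketch applies here, with the extra keyword controlling how many decimal places geo-coordinates keep (`db` is again an assumed instance name):

    md_path = db.to_cldf('cldf_out', coordinate_precision=2)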
Example #6
def test_field_order(tmpdir):
    srcs = Sources()
    src = Source('misc', 'x')  # src is an OrderedDict and we add title *after* year.
    src['year'] = '2018'
    src['title'] = 'The Title'
    srcs.add(src)
    bib = tmpdir / 'test.bib'
    srcs.write(str(bib))
    res = bib.read_text(encoding='utf8')
    # Still, title should be printed in the BibTeX before year:
    assert res.index('title =') < res.index('year =')
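
For context, the written file would contain an entry roughly like the following (the exact whitespace and brace style are an assumption, not taken from the library):

    @misc{x,
        title = {The Title},
        year = {2018}
    }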
Example #7
def test_Sources_roundtrip_latex(tmpdir, bibtex, expected):
    src = Sources()
    src.add(bibtex)
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
    assert expected in bib.read_text('utf8')
Example #8
def test_Sources_with_None_values(tmpdir):
    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
Example #9
class Dataset(object):
    """
    API to access a CLDF dataset.
    """
    def __init__(self, name):
        assert NAME_PATTERN.match(name)
        self.name = name
        self.sources = Sources()
        self.metadata = Metadata()
        self._rows = OrderedDict()

        # We store the fields (a.k.a. header) as a tuple because it must be immutable
        # after first assignment (since changing it is not well defined when there are
        # already rows).
        self._fields = ()
        self._source_count = None
        self._cited_sources = set()
        self._table = None

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.name)

    def __len__(self):
        """The length of a dataset is the number of rows in the values file."""
        return len(self.rows)

    def __getitem__(self, item):
        """
        Individual rows can be accessed by integer index or by row ID.

        :param item: `int` to access row by index, `str` to access by row ID
        :return: `OrderedDict`
        """
        if isinstance(item, int):
            return self.rows[item]
        return self._rows[item]

    @property
    def fields(self):
        """
        Read-only property to access the fields (a.k.a. header) defined for the dataset.

        :return: `tuple` of field names
        """
        return self._fields

    @property
    def table(self):
        return self._table

    @fields.setter
    def fields(self, value):
        """
        Fields can be assigned (but only once) for a dataset.

        :param value: `tuple` of field names.
        """
        if self._fields:
            raise ValueError('fields can only be assigned once!')
        assert isinstance(value, tuple)
        assert all(any(field in value for field in variants)
                   for variants in REQUIRED_FIELDS)
        table = self.metadata.get_table()
        if table:
            assert list(value) == list(table.schema.columns.keys())
        else:
            table = self.metadata.add_table(
                'values',
                '',
                [{'name': col, 'datatype': 'string'} for col in value])
            table.schema.primaryKey = 'ID'
        self._table = table
        self._fields = value

    @property
    def rows(self):
        return list(self._rows.values())

    @property
    def stats(self):
        return dict(
            languages=set(row['Language_ID'] for row in self.rows),
            parameters=set(row['Parameter_ID'] for row in self.rows),
            rowcount=(
                len(self),
                sum([1 for row in self.rows
                     if row['Language_ID'] and row['Parameter_ID']])),
            values=Counter(row['Value'] for row in self.rows),
        )

    def add_row(self, row):
        if not row:
            return

        d = ValuesRow.from_list(self, row)
        if d['ID'] in self._rows:
            raise ValueError('duplicate row ID: %s' % d['ID'])
        for ref in self.sources.expand_refs(d.get('Source', '')):
            self._cited_sources.add(ref.source.id)
        self._rows[d['ID']] = d
        return d

    @staticmethod
    def filename(fname, type_):
        """
        Compute the path for optional CLDF files relative to a given values file.

        :param fname: Path of the values file
        :param type_: Type of the optional file
        :return: name of the optional file
        """
        if type_ == 'sources':
            return fname.stem + '.bib'
        if type_ == 'metadata':
            return fname.stem + fname.suffix + MD_SUFFIX
        raise ValueError(type_)  # pragma: no cover

    @staticmethod
    def _existing_file(fname):
        fname = Path(fname)
        assert fname.exists() and fname.is_file()
        return fname

    @classmethod
    def _from(cls, data, container=None, skip_on_error=False):
        container = container or data.parent
        dataset = cls(data.stem)
        dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
        dataset._table = dataset.metadata.get_table()
        dataset.sources.read(Dataset.filename(data, 'sources'), container)
        delimiter = ','
        if dataset.table:
            delimiter = dataset.table.dialect.delimiter
        if data.suffix in TAB_SUFFIXES:
            delimiter = '\t'

        if isinstance(container, Archive):
            rows = container.read_text(data.name).split('\n')
        else:
            rows = data

        for i, row in enumerate(reader(rows, delimiter=delimiter)):
            if i == 0:
                dataset.fields = tuple(row)
            else:
                try:
                    dataset.add_row(row)
                except ValueError as e:
                    if skip_on_error:
                        log.warning('skipping row in line %s: %s' % (i + 1, e))
                    else:
                        raise e
        dataset.table.dialect.delimiter = delimiter
        dataset.table.url = data.name
        return dataset

    @classmethod
    def from_zip(cls, fname, name=None):
        archive = Archive(cls._existing_file(fname))
        return cls._from(
            Path(archive.metadata_name(prefix=name)[:-len(MD_SUFFIX)]), archive)

    @classmethod
    def from_metadata(cls, fname, container=None):
        fname = Path(fname)
        if not fname.name.endswith(MD_SUFFIX):
            raise ValueError('metadata file name must end with %s' % MD_SUFFIX)
        return cls._from(
            fname.parent.joinpath(fname.name[:-len(MD_SUFFIX)]), container=container)

    @classmethod
    def from_file(cls, fname, skip_on_error=False):
        """
        Factory method to create a `Dataset` from a CLDF values file.

        :param fname: Path of the CLDF values file.
        :param skip_on_error: If `True`, rows that fail to parse are skipped with a warning.
        :return: `Dataset` instance.
        """
        return cls._from(cls._existing_file(fname), skip_on_error=skip_on_error)

    def write(self, outdir='.', suffix='.csv', cited_sources_only=False, archive=False):
        outdir = Path(outdir)
        if not outdir.exists():
            raise ValueError(outdir.as_posix())

        close = False
        if archive:
            if isinstance(archive, Archive):
                container = archive
            else:
                container = Archive(outdir.joinpath(self.name + '.zip'), mode='w')
                close = True
        else:
            container = outdir

        fname = Path(outdir).joinpath(self.name + suffix)
        if fname.suffix in TAB_SUFFIXES:
            self.table.dialect.delimiter = '\t'

        with UnicodeWriter(
                None if isinstance(container, Archive) else fname,
                delimiter=self.table.dialect.delimiter) as writer:
            writer.writerow(self.fields)
            for row in self.rows:
                writer.writerow(row.to_list())

        if isinstance(container, Archive):
            container.write_text(writer.read(), fname.name)
        self.table.url = fname.name

        self.metadata.write(Dataset.filename(fname, 'metadata'), container)
        ids = self._cited_sources if cited_sources_only else None
        self.sources.write(Dataset.filename(fname, 'sources'), container, ids=ids)
        if close:
            container.close()
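
A minimal round-trip sketch for this class, assuming the field names below satisfy REQUIRED_FIELDS and that `ValuesRow.from_list` maps the list positionally onto the fields (both assumptions, based on the code above):

    # Illustrative names and values only.
    ds = Dataset('mydataset')
    ds.fields = ('ID', 'Language_ID', 'Parameter_ID', 'Value', 'Source')
    ds.add_row(['1', 'lang1', 'param1', 'yes', ''])
    ds.write(outdir='.')  # writes mydataset.csv plus its .bib and metadata companions
    ds2 = Dataset.from_file('mydataset.csv')
    assert len(ds2) == 1
    assert ds2['1']['Value'] == 'yes'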