Example #1
def test_newcol(tmpdir, db):
    ds = StructureDataset.in_dir(str(tmpdir / 'd'))

    # We rename the ID column of the ValueTable. Note that the propertyUrl
    # remains the same:
    ds['ValueTable', 'ID'].name = 'idx'
    ds['ValueTable'].tableSchema.columns.extend([
        Column(name='col1', datatype='anyURI'),
        Column(name='col2', datatype='integer'),
        Column(name='col3'),
    ])
    ds.write(ValueTable=[{
        'idx': '1',
        'Language_ID': 'l',
        'Parameter_ID': 'p',
        'Value': 'v',
        'Source': ['meier2015'],
        'col2': 5,
        'col1': anyURI().to_python('http://example.org')
    }])
    db.create()
    with pytest.raises(IntegrityError):  # A missing source is referenced!
        db.load(ds)
    ds.add_sources("@misc{meier2015,\ntitle={title}\n}")
    db.load(ds)
    assert db.fetchone("""\
select
  s.title
from
  SourceTable as s, ValueSource as vs, ValueTable as v
where
  s.ID = vs.Source_ID and vs.Value_ID = v.id and v.id = 1""")[0] == 'title'
    assert db.fetchone(
        "select col1 from valuetable")[0] == 'http://example.org'
    assert db.fetchone("select col2 from valuetable")[0] == 5
Example #2
def test_db(tmpdir, dataset, mocker, capsys):
    db = Database(str(tmpdir.join('lexibank.sqlite')))
    db.load(dataset)
    db.create(exists_ok=True)
    with pytest.raises(ValueError):  # the database file already exists
        db.create()
    db.create(force=True)  # drop and recreate
    db.load(dataset)
    db.load_glottolog_data(dataset.glottolog)
    db.load_concepticon_data(mocker.Mock(conceptsets={}))
    for sql in db.sql:
        db.fetchall(sql)
    with db.connection() as conn:
        db.fetchall('select * from dataset', conn=conn, verbose=True)
    out, _ = capsys.readouterr()
    assert 'select' in out

    db.create(force=True)
    db.load(dataset)
    cldf_ds = dataset.cldf_reader()
    cols = cldf_ds['FormTable'].tableSchema.columns
    cols.append(Column(name='custom'))
    cldf_ds.write_metadata()
    db.load(dataset)
    cols.pop()
    cols.append(Column(name='custom', datatype='integer'))
    cldf_ds.write_metadata()
    with pytest.raises(ValueError):
        db.load(dataset)
    cols.pop()
    cldf_ds.write_metadata()
    db.load(dataset)
Example #3
def make_column(spec):
    if isinstance(spec, string_types):
        if spec in TERMS.by_uri:
            return TERMS.by_uri[spec].to_column()
        return Column(name=spec, datatype='string')
    if isinstance(spec, dict):
        return Column.fromvalue(spec)
    if isinstance(spec, Column):
        return spec
    raise TypeError(spec)
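
make_column thus accepts three kinds of specs: a string (a known CLDF term URI
or a plain column name), a dict passed to Column.fromvalue, or a ready-made
Column, which is returned unchanged. A hedged sketch of all three, assuming
the CLDF v1.0 term URI for 'source':

from csvw.metadata import Column

make_column('http://cldf.clld.org/v1.0/terms.rdf#source')  # known term URI
make_column('mycol')                               # plain name, string-typed
make_column({'name': 'n', 'datatype': 'integer'})  # dict via Column.fromvalue
make_column(Column(name='n'))                      # returned unchanged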
Example #4
    def to_column(self):
        col = Column(
            name=self.csvw_prop('name') or self.element.find(qname(RDFS, 'label')).text,
            propertyUrl=self.element.attrib[qname(RDF, 'about')],
            datatype=self.csvw_prop('datatype') or 'string')
        for k in ['separator', 'null', 'valueUrl']:
            v = self.csvw_prop(k)
            if v:
                setattr(col, k, v)
        return col
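
The attributes set in the loop above (separator, null, valueUrl) can also be
supplied up front via Column.fromvalue. A minimal sketch with an illustrative
propertyUrl:

from csvw.metadata import Column

col = Column.fromvalue({
    'name': 'Source',
    'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
    'datatype': 'string',
    'separator': ';',
})
assert col.separator == ';'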
Example #5
    def __str__(self):
        # We override the base class' method to fix the order of columns.
        tg = TableGroup.fromvalue(self.MD)
        for col in sorted(
                self.column_labels, key=lambda t: (t == IPA_COLUMN, t.lower()), reverse=True):
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({"name": col, "null": self.NULL}))

        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()
Example #6
    def __init__(self, dataset):
        self._count = defaultdict(int)
        self._cognate_count = defaultdict(int)
        self.dataset = dataset

        md = self.dataset.cldf_dir / MD_NAME
        if not md.exists():
            md = self.dataset.cldf_dir / ALT_MD_NAME
            if not md.exists():
                # Neither metadata file is present; fall back to copying the
                # default metadata shipped with this package.
                md = self.dataset.cldf_dir / MD_NAME
                copy(Path(__file__).parent / MD_NAME, md)
        self.wl = Wordlist.from_metadata(md)
        default_cldf = Wordlist.from_metadata(
            Path(__file__).parent / 'cldf-metadata.json')

        self.objects = {}
        self._obj_index = {}
        for cls in [
                self.dataset.lexeme_class,
                self.dataset.language_class,
                self.dataset.concept_class,
                self.dataset.cognate_class,
        ]:
            self.objects[cls.__cldf_table__()] = []
            self._obj_index[cls.__cldf_table__()] = set()

            cols = set(
                col.header
                for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
            properties = set(
                col.propertyUrl.uri
                for col in self.wl[cls.__cldf_table__()].tableSchema.columns
                if col.propertyUrl)
            for field in cls.fieldnames():
                try:
                    col = default_cldf[cls.__cldf_table__(), field]
                    #
                    # We added Latitude and Longitude to the default metadata later, and want to
                    # make sure existing datasets are upgraded silently.
                    #
                    if field in ['Latitude', 'Longitude'] \
                            and cls.__cldf_table__() == 'LanguageTable':
                        properties.add(col.propertyUrl.uri)
                        self.wl[cls.__cldf_table__(),
                                field].propertyUrl = col.propertyUrl
                        self.wl[cls.__cldf_table__(),
                                field].datatype = col.datatype
                except KeyError:
                    col = Column(name=field, datatype="string")
                if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                        ((not col.propertyUrl) and (field not in cols)):
                    self.wl[cls.__cldf_table__()].tableSchema.columns.append(
                        col)
Example #7
def test_update(tmpdir, db):
    ds = Dictionary.in_dir(str(tmpdir / 'd1'))
    ds.write(EntryTable=[], SenseTable=[])
    ds2 = Dictionary.in_dir(str(tmpdir / 'd2'))
    ds2.write(EntryTable=[], SenseTable=[])
    db.create()
    db.load(ds)
    db.load(ds2)
    ds.tables[0].tableSchema.columns.append(
        Column(name='newcol', datatype='integer'))
    db.load(ds)  # adding a column to the schema is fine
    ds.tables[0].tableSchema.columns[-1].datatype.base = 'string'
    with pytest.raises(ValueError):  # changing an existing column's datatype is not
        db.load(ds)
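
The datatype switch above works by mutating the csvw Datatype object attached
to the column; db.load then notices that the persisted schema no longer
matches and refuses the update. A minimal sketch of the mutation in isolation
(Datatype.parse is assumed as the value-reading API):

from csvw.metadata import Column

col = Column(name='newcol', datatype='integer')
assert col.datatype.base == 'integer'
col.datatype.base = 'string'  # values are now read back as strings
assert col.datatype.parse('5') == '5'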
Example #8
    def __enter__(self):
        super().__enter__()
        default_cldf = Wordlist.from_metadata(
            pathlib.Path(__file__).parent / MD_NAME)

        self._obj_index = {}
        for cls in [
                self.dataset.lexeme_class,
                self.dataset.language_class,
                self.dataset.concept_class,
                self.dataset.cognate_class,
        ]:
            self.objects[cls.__cldf_table__()] = []
            self._obj_index[cls.__cldf_table__()] = set()

            cols = set(
                col.header
                for col in self.cldf[cls.__cldf_table__()].tableSchema.columns)
            properties = set(
                col.propertyUrl.uri
                for col in self.cldf[cls.__cldf_table__()].tableSchema.columns
                if col.propertyUrl)
            for field in cls.fieldnames():
                try:
                    col = default_cldf[cls.__cldf_table__(), field]
                    #
                    # We added Latitude and Longitude to the default metadata later, and want to
                    # make sure existing datasets are upgraded silently.
                    #
                    if field in ['Latitude', 'Longitude'] \
                            and cls.__cldf_table__() == 'LanguageTable':  # pragma: no cover
                        properties.add(col.propertyUrl.uri)
                        self.cldf[cls.__cldf_table__(),
                                  field].propertyUrl = col.propertyUrl
                        self.cldf[cls.__cldf_table__(),
                                  field].datatype = col.datatype
                except KeyError:
                    col = Column(name=field, datatype="string")
                if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                        ((not col.propertyUrl) and (field not in cols)):
                    self.cldf[cls.__cldf_table__()].tableSchema.columns.append(
                        col)
        return self
Example #9
File: cldf.py Project: clld/clld
    def create(self, req, filename=None, verbose=True, outfile=None):
        cldf_cfg = req.registry.getUtility(ICldfConfig)

        with TemporaryDirectory() as tmpd:
            cls = getattr(dataset, cldf_cfg.module)
            ds = cls.in_dir(tmpd)
            ds.properties['dc:bibliographicCitation'] = text_citation(req, req.dataset)
            ds.properties['dc:publisher'] = '%s, %s' % (
                req.dataset.publisher_name, req.dataset.publisher_place)
            ds.properties['dc:license'] = req.dataset.license
            ds.properties['dc:issued'] = req.dataset.published.isoformat()
            ds.properties['dc:title'] = req.dataset.name
            ds.properties['dc:creator'] = req.dataset.formatted_editors()
            ds.properties['dc:identifier'] = req.resource_url(req.dataset)
            ds.properties['dcat:accessURL'] = req.route_url('download')
            if DBSession.query(Sentence).count():
                ds.add_component('ExampleTable')
            if DBSession.query(DomainElement).count():
                ds.add_component('CodeTable', {'name': 'Number', 'datatype': 'integer'})
            ds.add_component('ParameterTable')
            ds.add_component('LanguageTable')
            ds.add_table('contributions.csv', 'ID', 'Name', 'Description', 'Contributors')
            ds.add_columns(ds.primary_table, Column.fromvalue(
                {
                    'name': 'Contribution_ID',
                    'datatype': 'string',
                    'valueUrl': url_template(req, 'contribution', 'contribution').uri,
                }))
            ds.add_foreign_key(
                ds.primary_table, 'Contribution_ID', 'contributions.csv', 'ID')
            ds['LanguageTable'].aboutUrl = url_template(req, 'language', 'ID')
            ds['ParameterTable'].aboutUrl = url_template(req, 'parameter', 'ID')
            ds[ds.primary_table].aboutUrl = url_template(req, 'value', 'ID')

            cldf_cfg.custom_schema(req, ds)

            for src in cldf_cfg.query(Source):
                ds.sources.add(cldf_cfg.convert(Source, src, req))
            fname = outfile or self.abspath(req)

            transaction.abort()

            tabledata = defaultdict(list)
            for table, model in [
                ('ParameterTable', Parameter),
                ('CodeTable', DomainElement),
                ('LanguageTable', Language),
                ('ExampleTable', Sentence),
                ('contributions.csv', Contribution),
                (ds.primary_table, Value),
            ]:
                if verbose:
                    print('exporting {0} ...'.format(model))
                transaction.begin()
                for item in cldf_cfg.query(model):
                    tabledata[table].append(cldf_cfg.convert(model, item, req))
                transaction.abort()
                if verbose:
                    print('... done')

            transaction.begin()
            ds.write(**cldf_cfg.custom_tabledata(req, tabledata))
            ds.validate()

            shutil.make_archive(
                fname.parent.joinpath(fname.stem).as_posix(), 'zip', tmpd.as_posix())
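
Outside of a clld app, the same schema-building calls work on a bare dataset.
A minimal sketch with illustrative names:

from pycldf import StructureDataset

ds = StructureDataset.in_dir('tmp')
ds.add_component('LanguageTable')
ds.add_table('contributions.csv', 'ID', 'Name')
ds.add_columns(ds.primary_table,
               {'name': 'Contribution_ID', 'datatype': 'string'})
ds.add_foreign_key(ds.primary_table, 'Contribution_ID', 'contributions.csv', 'ID')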
Example #10
def test_make_column():
    assert make_column(term_uri('source')).separator == ';'
    assert make_column(Column('name')).datatype is None
    with pytest.raises(TypeError):
        make_column(5)
Example #11
    def create(self, req, filename=None, verbose=True, outfile=None):
        cldf_cfg = req.registry.getUtility(ICldfConfig)

        with TemporaryDirectory() as tmpd:
            cls = getattr(dataset, cldf_cfg.module)
            ds = cls.in_dir(tmpd)
            ds.properties['dc:bibliographicCitation'] = text_citation(
                req, req.dataset)
            ds.properties['dc:publisher'] = '%s, %s' % (
                req.dataset.publisher_name, req.dataset.publisher_place)
            ds.properties['dc:license'] = req.dataset.license
            ds.properties['dc:issued'] = req.dataset.published.isoformat()
            ds.properties['dc:title'] = req.dataset.name
            ds.properties['dc:creator'] = req.dataset.formatted_editors()
            ds.properties['dc:identifier'] = req.resource_url(req.dataset)
            ds.properties['dcat:accessURL'] = req.route_url('download')
            if DBSession.query(Sentence).count():
                ds.add_component('ExampleTable')
            if DBSession.query(DomainElement).count():
                ds.add_component('CodeTable', {
                    'name': 'Number',
                    'datatype': 'integer'
                })
            ds.add_component('ParameterTable')
            ds.add_component('LanguageTable')
            ds.add_table('contributions.csv', 'ID', 'Name', 'Description',
                         'Contributors')
            ds.add_columns(
                ds.primary_table,
                Column.fromvalue({
                    'name': 'Contribution_ID',
                    'datatype': 'string',
                    'valueUrl': url_template(req, 'contribution', 'contribution').uri,
                }))
            ds.add_foreign_key(ds.primary_table, 'Contribution_ID',
                               'contributions.csv', 'ID')
            ds['LanguageTable'].aboutUrl = url_template(req, 'language', 'ID')
            ds['ParameterTable'].aboutUrl = url_template(
                req, 'parameter', 'ID')
            ds[ds.primary_table].aboutUrl = url_template(req, 'value', 'ID')

            cldf_cfg.custom_schema(req, ds)

            for src in cldf_cfg.query(Source):
                ds.sources.add(cldf_cfg.convert(Source, src, req))
            fname = outfile or self.abspath(req)

            transaction.abort()

            tabledata = defaultdict(list)
            for table, model in [
                ('ParameterTable', Parameter),
                ('CodeTable', DomainElement),
                ('LanguageTable', Language),
                ('ExampleTable', Sentence),
                ('contributions.csv', Contribution),
                (ds.primary_table, Value),
            ]:
                if verbose:
                    print('exporting {0} ...'.format(model))
                transaction.begin()
                for item in cldf_cfg.query(model):
                    tabledata[table].append(cldf_cfg.convert(model, item, req))
                transaction.abort()
                if verbose:
                    print('... done')

            transaction.begin()
            ds.write(**cldf_cfg.custom_tabledata(req, tabledata))
            ds.validate()

            shutil.make_archive(str(fname.parent / fname.stem), 'zip',
                                str(tmpd))
Example #12
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.

    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds
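
A hedged usage sketch for add_metadata, assuming a metadata-free wordlist
forms.csv in the current directory:

from pathlib import Path

ds = add_metadata(Path('forms.csv'))
ds.write_metadata()  # writes Wordlist-metadata.json next to forms.csv
ds.validate()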
Example #13
import pycldf
from csvw.dsv import iterrows
from csvw.metadata import Column, Datatype, TableGroup

from lexedata import cli

DEFAULT_NAME_COLUMNS = {
    column.name: column
    for column in (term.to_column() for term in pycldf.TERMS.values())
}

LEXEDATA_COLUMNS = {
    "Status": Column(
        datatype=Datatype(base="string"),
        default="",
        lang="eng",
        null=[""],
        name="Value",
        aboutUrl="...",
    ),
    "Orthographic": Column(
        datatype=Datatype(base="string"),
        default="",
        null=[""],
        name="Orthographic",
        aboutUrl="...",
    ),
    "Phonemic": Column(
        datatype=Datatype(base="string"),
        default="",