def test_newcol(tmpdir, db):
    ds = StructureDataset.in_dir(str(tmpdir / 'd'))
    # We rename the ID column of the ValueTable. Note that the propertyUrl
    # remains the same:
    ds['ValueTable', 'ID'].name = 'idx'
    ds['ValueTable'].tableSchema.columns.extend([
        Column(name='col1', datatype='anyURI'),
        Column(name='col2', datatype='integer'),
        Column(name='col3'),
    ])
    ds.write(ValueTable=[{
        'idx': '1',
        'Language_ID': 'l',
        'Parameter_ID': 'p',
        'Value': 'v',
        'Source': ['meier2015'],
        'col2': 5,
        'col1': anyURI().to_python('http://example.org'),
    }])
    db.create()
    with pytest.raises(IntegrityError):
        # A missing source is referenced!
        db.load(ds)
    ds.add_sources("@misc{meier2015,\ntitle={title}\n}")
    db.load(ds)
    assert db.fetchone("""\
select s.title
from SourceTable as s, ValueSource as vs, ValueTable as v
where s.ID = vs.Source_ID and vs.Value_ID = v.id and v.id = 1""")[0] == 'title'
    assert db.fetchone(
        "select col1 from valuetable")[0] == 'http://example.org'
    assert db.fetchone("select col2 from valuetable")[0] == 5

def test_db(tmpdir, dataset, mocker, capsys):
    db = Database(str(tmpdir.join('lexibank.sqlite')))
    db.load(dataset)
    db.create(exists_ok=True)
    with pytest.raises(ValueError):
        db.create()
    db.create(force=True)
    db.load(dataset)
    db.load_glottolog_data(dataset.glottolog)
    db.load_concepticon_data(mocker.Mock(conceptsets={}))
    for sql in db.sql:
        db.fetchall(sql)

    with db.connection() as conn:
        db.fetchall('select * from dataset', conn=conn, verbose=True)
    out, _ = capsys.readouterr()
    assert 'select' in out

    db.create(force=True)
    db.load(dataset)
    cldf_ds = dataset.cldf_reader()
    cols = cldf_ds['FormTable'].tableSchema.columns
    cols.append(Column(name='custom'))
    cldf_ds.write_metadata()
    db.load(dataset)
    cols.pop()
    cols.append(Column(name='custom', datatype='integer'))
    cldf_ds.write_metadata()
    with pytest.raises(ValueError):
        db.load(dataset)
    cols.pop()
    cldf_ds.write_metadata()
    db.load(dataset)

def make_column(spec):
    if isinstance(spec, string_types):
        if spec in TERMS.by_uri:
            return TERMS.by_uri[spec].to_column()
        return Column(name=spec, datatype='string')
    if isinstance(spec, dict):
        return Column.fromvalue(spec)
    if isinstance(spec, Column):
        return spec
    raise TypeError(spec)

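# Usage sketch (not part of the original source): make_column dispatches on
# the type of its argument. It assumes make_column and pycldf's TERMS registry
# are importable as in the function above.
from csvw.metadata import Column

col = make_column('Comment')  # plain name -> string-typed Column
assert col.name == 'Comment'

col = make_column({'name': 'Number', 'datatype': 'integer'})  # dict -> Column.fromvalue
assert col.datatype.base == 'integer'

col = make_column(Column(name='X'))  # an existing Column is returned unchanged
assert col.name == 'X'
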
def to_column(self):
    col = Column(
        name=self.csvw_prop('name') or self.element.find(qname(RDFS, 'label')).text,
        propertyUrl=self.element.attrib[qname(RDF, 'about')],
        datatype=self.csvw_prop('datatype') or 'string')
    for k in ['separator', 'null', 'valueUrl']:
        v = self.csvw_prop(k)
        if v:
            setattr(col, k, v)
    return col

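# Usage sketch (assumption: to_column above is the method of pycldf's term
# objects, reachable via the TERMS registry). Converting the list-valued
# 'source' term yields a Column carrying the separator from the ontology:
import pycldf

col = pycldf.TERMS['source'].to_column()
assert col.separator == ';'
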
def __str__(self):
    # We override the base class' method to fix the order of columns.
    tg = TableGroup.fromvalue(self.MD)
    for col in sorted(
            self.column_labels, key=lambda t: (t == IPA_COLUMN, t.lower()), reverse=True):
        if col != self.GRAPHEME_COL:
            tg.tables[0].tableSchema.columns.append(
                Column.fromvalue({"name": col, "null": self.NULL}))
    return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()

def __init__(self, dataset):
    self._count = defaultdict(int)
    self._cognate_count = defaultdict(int)
    self.dataset = dataset

    md = self.dataset.cldf_dir / MD_NAME
    if not md.exists():
        md = self.dataset.cldf_dir / ALT_MD_NAME
        if not md.exists():
            md = self.dataset.cldf_dir / MD_NAME
            copy(Path(__file__).parent / MD_NAME, md)
    self.wl = Wordlist.from_metadata(md)
    default_cldf = Wordlist.from_metadata(
        Path(__file__).parent / 'cldf-metadata.json')

    self.objects = {}
    self._obj_index = {}
    for cls in [
        self.dataset.lexeme_class,
        self.dataset.language_class,
        self.dataset.concept_class,
        self.dataset.cognate_class,
    ]:
        self.objects[cls.__cldf_table__()] = []
        self._obj_index[cls.__cldf_table__()] = set()

        cols = set(
            col.header for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
        properties = set(
            col.propertyUrl.uri
            for col in self.wl[cls.__cldf_table__()].tableSchema.columns
            if col.propertyUrl)
        for field in cls.fieldnames():
            try:
                col = default_cldf[cls.__cldf_table__(), field]
                #
                # We added Latitude and Longitude to the default metadata later, and
                # want to make sure existing datasets are upgraded silently.
                #
                if field in ['Latitude', 'Longitude'] \
                        and cls.__cldf_table__() == 'LanguageTable':
                    properties.add(col.propertyUrl.uri)
                    self.wl[cls.__cldf_table__(), field].propertyUrl = col.propertyUrl
                    self.wl[cls.__cldf_table__(), field].datatype = col.datatype
            except KeyError:
                col = Column(name=field, datatype="string")
            if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                    ((not col.propertyUrl) and (field not in cols)):
                self.wl[cls.__cldf_table__()].tableSchema.columns.append(col)

def test_update(tmpdir, db):
    ds = Dictionary.in_dir(str(tmpdir / 'd1'))
    ds.write(EntryTable=[], SenseTable=[])
    ds2 = Dictionary.in_dir(str(tmpdir / 'd2'))
    ds2.write(EntryTable=[], SenseTable=[])

    db.create()
    db.load(ds)
    db.load(ds2)

    # Adding a column is accepted when reloading a dataset ...
    ds.tables[0].tableSchema.columns.append(
        Column(name='newcol', datatype='integer'))
    db.load(ds)

    # ... but changing an existing column's datatype is not.
    ds.tables[0].tableSchema.columns[-1].datatype.base = 'string'
    with pytest.raises(ValueError):
        db.load(ds)

def __enter__(self):
    super().__enter__()
    default_cldf = Wordlist.from_metadata(
        pathlib.Path(__file__).parent / MD_NAME)

    self._obj_index = {}
    for cls in [
        self.dataset.lexeme_class,
        self.dataset.language_class,
        self.dataset.concept_class,
        self.dataset.cognate_class,
    ]:
        self.objects[cls.__cldf_table__()] = []
        self._obj_index[cls.__cldf_table__()] = set()

        cols = set(
            col.header for col in self.cldf[cls.__cldf_table__()].tableSchema.columns)
        properties = set(
            col.propertyUrl.uri
            for col in self.cldf[cls.__cldf_table__()].tableSchema.columns
            if col.propertyUrl)
        for field in cls.fieldnames():
            try:
                col = default_cldf[cls.__cldf_table__(), field]
                #
                # We added Latitude and Longitude to the default metadata later, and
                # want to make sure existing datasets are upgraded silently.
                #
                if field in ['Latitude', 'Longitude'] \
                        and cls.__cldf_table__() == 'LanguageTable':  # pragma: no cover
                    properties.add(col.propertyUrl.uri)
                    self.cldf[cls.__cldf_table__(), field].propertyUrl = col.propertyUrl
                    self.cldf[cls.__cldf_table__(), field].datatype = col.datatype
            except KeyError:
                col = Column(name=field, datatype="string")
            if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                    ((not col.propertyUrl) and (field not in cols)):
                self.cldf[cls.__cldf_table__()].tableSchema.columns.append(col)
    return self

def test_make_column():
    assert make_column(term_uri('source')).separator == ';'
    assert make_column(Column('name')).datatype is None
    with pytest.raises(TypeError):
        make_column(5)

def create(self, req, filename=None, verbose=True, outfile=None):
    cldf_cfg = req.registry.getUtility(ICldfConfig)

    with TemporaryDirectory() as tmpd:
        cls = getattr(dataset, cldf_cfg.module)
        ds = cls.in_dir(tmpd)
        ds.properties['dc:bibliographicCitation'] = text_citation(req, req.dataset)
        ds.properties['dc:publisher'] = '%s, %s' % (
            req.dataset.publisher_name, req.dataset.publisher_place)
        ds.properties['dc:license'] = req.dataset.license
        ds.properties['dc:issued'] = req.dataset.published.isoformat()
        ds.properties['dc:title'] = req.dataset.name
        ds.properties['dc:creator'] = req.dataset.formatted_editors()
        ds.properties['dc:identifier'] = req.resource_url(req.dataset)
        ds.properties['dcat:accessURL'] = req.route_url('download')
        if DBSession.query(Sentence).count():
            ds.add_component('ExampleTable')
        if DBSession.query(DomainElement).count():
            ds.add_component('CodeTable', {'name': 'Number', 'datatype': 'integer'})
        ds.add_component('ParameterTable')
        ds.add_component('LanguageTable')
        ds.add_table('contributions.csv', 'ID', 'Name', 'Description', 'Contributors')
        ds.add_columns(
            ds.primary_table,
            Column.fromvalue({
                'name': 'Contribution_ID',
                'datatype': 'string',
                'valueUrl': url_template(req, 'contribution', 'contribution').uri,
            }))
        ds.add_foreign_key(
            ds.primary_table, 'Contribution_ID', 'contributions.csv', 'ID')
        ds['LanguageTable'].aboutUrl = url_template(req, 'language', 'ID')
        ds['ParameterTable'].aboutUrl = url_template(req, 'parameter', 'ID')
        ds[ds.primary_table].aboutUrl = url_template(req, 'value', 'ID')

        cldf_cfg.custom_schema(req, ds)

        for src in cldf_cfg.query(Source):
            ds.sources.add(cldf_cfg.convert(Source, src, req))
        fname = outfile or self.abspath(req)

        transaction.abort()

        tabledata = defaultdict(list)
        for table, model in [
            ('ParameterTable', Parameter),
            ('CodeTable', DomainElement),
            ('LanguageTable', Language),
            ('ExampleTable', Sentence),
            ('contributions.csv', Contribution),
            (ds.primary_table, Value),
        ]:
            if verbose:
                print('exporting {0} ...'.format(model))
            transaction.begin()
            for item in cldf_cfg.query(model):
                tabledata[table].append(cldf_cfg.convert(model, item, req))
            transaction.abort()
            if verbose:
                print('... done')
        transaction.begin()

        ds.write(**cldf_cfg.custom_tabledata(req, tabledata))
        ds.validate()

        shutil.make_archive(str(fname.parent / fname.stem), 'zip', str(tmpd))

def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.
    colnames = next(iterrows(fname))
    understood_colnames = {
        c.name for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name
        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")
    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds

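# Hypothetical invocation sketch for add_metadata (paths are illustrative):
# derive metadata for a metadata-free wordlist and write the resulting
# Wordlist-metadata.json next to the data file.
from pathlib import Path

dataset = add_metadata(Path("forms.csv"))
dataset.write_metadata(Path("Wordlist-metadata.json"))
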
import pycldf
from csvw.dsv import iterrows
from csvw.metadata import Column, Datatype, TableGroup

from lexedata import cli

DEFAULT_NAME_COLUMNS = {
    column.name: column
    for column in (term.to_column() for term in pycldf.TERMS.values())
}

LEXEDATA_COLUMNS = {
    "Status": Column(
        datatype=Datatype(base="string"),
        default="",
        lang="eng",
        null=[""],
        name="Value",
        aboutUrl="...",
    ),
    "Orthographic": Column(
        datatype=Datatype(base="string"),
        default="",
        null=[""],
        name="Orthographic",
        aboutUrl="...",
    ),
    "Phonemic": Column(
        datatype=Datatype(base="string"),
        default="",