def from_metadata(cls, fname):
    fname = pathlib.Path(fname)
    if fname.is_dir():
        name = '{0}{1}'.format(cls.__name__, MD_SUFFIX)
        tablegroup = TableGroup.from_file(pkg_path('modules', name))
        # adapt the path of the metadata file such that paths to tables are resolved
        # correctly:
        tablegroup._fname = fname.joinpath(name)
    else:
        tablegroup = TableGroup.from_file(fname)
    comps = collections.Counter()
    for table in tablegroup.tables:
        try:
            dt = Dataset.get_tabletype(table)
            if dt:
                comps.update([dt])
        except ValueError:
            pass
    if comps and comps.most_common(1)[0][1] > 1:
        raise ValueError('{0}: duplicate components!'.format(fname))
    for mod in get_modules():
        if mod.match(tablegroup):
            return mod.cls(tablegroup)
    return cls(tablegroup)

def from_metadata(cls, fname):
    fname = Path(fname)
    if fname.is_dir():
        name = '{0}{1}'.format(cls.__name__, MD_SUFFIX)
        tablegroup = TableGroup.from_file(pkg_path('modules', name))
        # adapt the path of the metadata file such that paths to tables are resolved
        # correctly:
        tablegroup._fname = fname.joinpath(name)
    else:
        tablegroup = TableGroup.from_file(fname)
    for mod in get_modules():
        if mod.match(tablegroup):
            return mod.cls(tablegroup)
    return cls(tablegroup)

def iteraliases(name, table_name=None):
    for table in TableGroup.from_file(data_path(name, 'metadata.json')).tables:
        if table_name is None or table.local_name == table_name:
            for row in table:
                if row:
                    for alias in row['alias']:
                        yield char(alias), char(row['char'])

def metadata(self):
    md = self.path.parent.joinpath(self.path.name + '-metadata.json')
    if not md.exists():
        ddir = self._api.data_path() if hasattr(self._api, 'data_path') \
            else REPOS_PATH.joinpath('concepticondata')
        if self.local:
            md = ddir.joinpath('conceptlists', 'local-metadata.json')
        if not md.exists():
            md = ddir.joinpath('conceptlists', 'default-metadata.json')
    tg = TableGroup.from_file(md)
    if isinstance(self._api, Path):
        tg._fname = self._api.parent.joinpath(self._api.name + '-metadata.json')
    tg.tables[0].url = Link('{0}.tsv'.format(self.id))
    return tg.tables[0]

def from_metadata(cls, fname):
    fname = Path(fname)
    if fname.is_dir():
        name = '{0}{1}'.format(cls.__name__, MD_SUFFIX)
        tablegroup = TableGroup.from_file(pkg_path('modules', name))
        # adapt the path of the metadata file such that paths to tables are resolved
        # correctly:
        tablegroup._fname = fname.joinpath(name)
    else:
        tablegroup = TableGroup.from_file(fname)
    comps = Counter()
    for table in tablegroup.tables:
        try:
            comps.update([Dataset.get_tabletype(table)])
        except ValueError:
            pass
    if comps and comps.most_common(1)[0][1] > 1:
        raise ValueError('{0}: duplicate components!'.format(fname))
    for mod in get_modules():
        if mod.match(tablegroup):
            return mod.cls(tablegroup)
    return cls(tablegroup)

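# Hypothetical usage sketch (not taken from the snippets above): assuming the
# classmethod belongs to pycldf's Dataset, it is normally reached through the
# public API; the metadata path below is a placeholder.
#
#     from pycldf import Dataset
#     ds = Dataset.from_metadata('path/to/Wordlist-metadata.json')
#     print(ds.module)  # e.g. 'Wordlist', picked via dc:conformsTo
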
def get_modules():
    global _modules
    if not _modules:
        ds = sys.modules[__name__]
        for p in pkg_path('modules').glob('*{0}'.format(MD_SUFFIX)):
            tg = TableGroup.from_file(p)
            mod = Module(
                tg.common_props['dc:conformsTo'],
                tg.tables[0].url.string if tg.tables else None)
            mod.cls = getattr(ds, mod.id)
            _modules.append(mod)
        # prefer Wordlist over ParallelText (forms.csv)
        _modules = sorted(
            _modules,
            key=lambda m: (m.cls in (Wordlist, ParallelText), m.cls is ParallelText))
    return _modules

def tg(self):
    md = self.path.parent.joinpath(self.path.name + MD_SUFFIX)
    if not md.exists():
        if hasattr(self._api, 'repos'):
            ddir = self._api.path('concepticondata')
            if self.local:
                md = ddir.joinpath('conceptlists', 'local' + MD_SUFFIX)
            if not md.exists():
                md = ddir.joinpath('conceptlists', 'default' + MD_SUFFIX)
        else:
            md = Path(__file__).parent / 'conceptlist-metadata.json'
    tg = TableGroup.from_file(md)
    if isinstance(self._api, Path):
        tg._fname = self._api.parent.joinpath(self._api.name + MD_SUFFIX)
    tg.tables[0].url = Link('{0}.tsv'.format(self.id))
    return tg

def validate(self, log=None, validators=None):
    validators = validators or []
    validators.extend(VALIDATORS)
    success = True
    default_tg = TableGroup.from_file(
        pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX)))
    for default_table in default_tg.tables:
        dtable_uri = default_table.common_props['dc:conformsTo']
        try:
            table = self[dtable_uri]
        except KeyError:
            log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log)
            success = False
            table = None
        if table:
            default_cols = {
                c.propertyUrl.uri for c in default_table.tableSchema.columns
                if c.required or c.common_props.get('dc:isRequiredBy')}
            cols = {
                c.propertyUrl.uri for c in table.tableSchema.columns
                if c.propertyUrl}
            table_uri = table.common_props['dc:conformsTo']
            for col in default_cols - cols:
                log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log)
                success = False

    for table in self.tables:
        type_uri = table.common_props.get('dc:conformsTo')
        if type_uri:
            try:
                TERMS.is_cldf_uri(type_uri)
            except ValueError:
                success = False
                log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log)

        # FIXME: check whether table.common_props['dc:conformsTo'] is in validators!
        validators_ = []
        for col in table.tableSchema.columns:
            if col.propertyUrl:
                col_uri = col.propertyUrl.uri
                try:
                    TERMS.is_cldf_uri(col_uri)
                except ValueError:
                    success = False
                    log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log)
            for table_, col_, v_ in validators:
                if (not table_ or table is self.get(table_)) \
                        and col is self.get((table, col_)):
                    validators_.append((col, v_))

        fname = Path(table.url.resolve(table._parent.base))
        if fname.exists():
            for fname, lineno, row in table.iterdicts(log=log, with_metadata=True):
                for col, validate in validators_:
                    try:
                        validate(self, table, col, row)
                    except ValueError as e:
                        log_or_raise(
                            '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e),
                            log=log)
                        success = False
            if not table.check_primary_key(log=log):
                success = False

    if not self.tablegroup.check_referential_integrity(log=log):
        success = False

    return success

def validate(self, log=None, validators=None):
    validators = validators or []
    validators.extend(VALIDATORS)
    success = True
    default_tg = TableGroup.from_file(
        pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX)))
    for default_table in default_tg.tables:
        dtable_uri = default_table.common_props['dc:conformsTo']
        try:
            table = self[dtable_uri]
        except KeyError:
            log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log)
            success = False
            table = None
        if table:
            default_cols = {
                c.propertyUrl.uri for c in default_table.tableSchema.columns
                if c.required or c.common_props.get('dc:isRequiredBy')}
            cols = {
                c.propertyUrl.uri for c in table.tableSchema.columns
                if c.propertyUrl}
            table_uri = table.common_props['dc:conformsTo']
            for col in default_cols - cols:
                log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log)
                success = False

    for table in self.tables:
        type_uri = table.common_props.get('dc:conformsTo')
        if type_uri:
            try:
                TERMS.is_cldf_uri(type_uri)
            except ValueError:
                success = False
                log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log)

        # FIXME: check whether table.common_props['dc:conformsTo'] is in validators!
        validators_ = []
        for col in table.tableSchema.columns:
            if col.propertyUrl:
                col_uri = col.propertyUrl.uri
                try:
                    TERMS.is_cldf_uri(col_uri)
                except ValueError:
                    success = False
                    log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log)
            for table_, col_, v_ in validators:
                if (not table_ or table is self.get(table_)) \
                        and col is self.get((table, col_)):
                    validators_.append((col, v_))

        fname = Path(table.url.resolve(table._parent.base))
        if fname.exists():
            for fname, lineno, row in table.iterdicts(log=log, with_metadata=True):
                for col, validate in validators_:
                    try:
                        validate(self, table, col, row)
                    except ValueError as e:
                        log_or_raise(
                            '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e),
                            log=log)
                        success = False
            if not table.check_primary_key(log=log):
                success = False
        else:
            log_or_raise('{0} does not exist'.format(fname), log=log)
            success = False

    if not self.tablegroup.check_referential_integrity(log=log):
        success = False

    return success

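# Hedged usage sketch for validate() above: assuming this is pycldf's
# Dataset.validate, a typical call just checks the boolean result
# (the dataset path is a placeholder).
#
#     from pycldf import Dataset
#     ds = Dataset.from_metadata('path/to/StructureDataset-metadata.json')
#     if not ds.validate():
#         print('dataset has CLDF conformance problems')
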
from csvw.metadata import TableGroup
from lingpy import util
from lingpy.convert.html import template_path

# receive the template path from lingpy for splitstree
tpath = util.Path(template_path('splitstree.nex'))
if tpath.exists():
    _template = util.read_text_file(tpath.as_posix())
else:  # pragma: no cover
    raise IOError("Unknown template %s" % tpath)

tbg = TableGroup.from_file('cldf/StructureDataset-metadata.json')
taxa = {t['ID']: (i, t['Name'])
        for i, t in enumerate(tbg.tabledict['languages.csv'])}
params = {t['ID']: (i, t['Name'])
          for i, t in enumerate(tbg.tabledict['parameters.csv'])}

matrix = [[0 for p in params] for t in taxa]
for row in tbg.tabledict['values.csv']:
    tidx, tname = taxa[row['Language_ID']]
    pidx, pname = params[row['Parameter_ID']]
    if row['Value'] == '+':
        matrix[tidx][pidx] = 1

alpha = 'abcdefghijklmnopqrstuvwxyz'
alpha += alpha.upper()
alpha += '0123456789'

matrix_string = ''
tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0])
for i, line in enumerate(matrix):
    matrix_string += '{0:12}'.format(''.join([x for x in tax_list[i][1] if

def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.
    colnames = next(iterrows(fname))
    understood_colnames = {
        c.name for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]

    return ds

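# Hedged usage sketch for add_metadata() above: given a bare forms.csv, one
# might infer a schema and write it out roughly like this (the output call
# uses pycldf's write_metadata; the paths are placeholders).
#
#     from pathlib import Path
#     ds = add_metadata(Path('forms.csv'))
#     ds.write_metadata(Path('Wordlist-metadata.json'))
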
    try:
        alignments = [
            len(
                split_segments(data[value]['Alignment'])[
                    data[value]['Cognate_Sets'].index(key)])
            for value in values
        ]
        if len(set(alignments)) != 1:
            errors += [key]
    except IndexError:
        errors += [key]
    return errors


if __name__ == '__main__':
    tg = TableGroup.from_file('Wordlist-metadata.json')
    problems = []
    count = 1
    wordlist = {}
    for item in tg.tables[0]:
        morphemes = validate_morphemes(item)
        wordlist[item['ID']] = item
        struc = validate_structure(item)
        if not morphemes:
            problems += [[
                count, 'morphemes', str(item['ID']), item['Language_Name'],
                item['Parameter_name'], ' '.join(item['Segments'])
            ]]
            count += 1
        if not struc:

def tablegroup(self):
    return TableGroup.from_file(self.dir.joinpath(self.fname + '-metadata.json'))

def table(self):
    return TableGroup.from_file(self.path).tabledict[self.id + '.tsv']

def from_cldf(path, to=Wordlist, concept='Name', concepticon='Concepticon_ID',
              glottocode='Glottocode', language='Name'):
    """
    Load data from CLDF into a LingPy Wordlist object or similar.

    Parameters
    ----------
    path : str
        The path to the metadata-file of your CLDF dataset.
    to : ~lingpy.basic.wordlist.Wordlist
        A ~lingpy.basic.wordlist.Wordlist object or one of its descendants
        (LexStat, Alignments).
    concept : str (default='Name')
        The name used for the basic gloss in the `parameters.csv` table.
    glottocode : str (default='Glottocode')
        The default name for the column storing the Glottolog ID in the
        `languages.csv` table.
    language : str (default='Name')
        The default name for the language name in the `languages.csv` table.
    concepticon : str (default='Concepticon_ID')
        The default name for the concept set in the `parameters.csv` table.

    Notes
    -----
    This function does not offer absolute flexibility regarding the data you
    can input so far. However, it can regularly read CLDF-formatted data into
    LingPy and thus allow you to use CLDF data in LingPy analyses.
    """
    tbg = TableGroup.from_file(path)
    forms = tbg.tabledict['forms.csv']
    # obtain the dictionaries to convert ids to values
    taxa = {t['ID']: (t[language], t[glottocode])
            for t in tbg.tabledict['languages.csv']}
    concepts = {c['ID']: (c[concept], c[concepticon])
                for c in tbg.tabledict['parameters.csv']}
    # create dictionary
    D = {}
    id2idx = {}
    for i, row in enumerate(forms):
        # check for numeric ID
        if row['ID'].isdigit():
            idx = int(row['ID'])
        else:
            idx = i + 1
        id2idx[row['ID']] = idx

        doculect, glottocode = taxa[row['Language_ID']]
        concept, concepticon_id = concepts[row['Parameter_ID']]
        D[idx] = [doculect, glottocode, concept, concepticon_id] + [
            row.get(f, '') or '' for f in
            ['form_in_source', 'Form', 'Segments', 'Comment', 'Source']]

    # add the header
    D[0] = ['doculect', 'glottocode', 'concept', 'concepticon_id', 'value',
            'form', 'tokens', 'note', 'source']

    # convert to wordlist (simplifies handling)
    wordlist = to(D)

    # add cognates if they are needed and provided
    if 'cognates.csv' in tbg.tabledict:
        cognates = {id2idx[row['Form_ID']]: (row['Cognateset_ID'], row['Alignment'])
                    for row in tbg.tabledict['cognates.csv']}
        if cognates:
            wordlist.add_entries('cogid', cognates, lambda x: x[0] or 0)
            wordlist.add_entries('alignment', cognates, lambda x: x[1] or '')

    return wordlist

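# Hedged usage sketch for from_cldf() above: reading a CLDF Wordlist into a
# LingPy Wordlist object (the metadata path is a placeholder).
#
#     from lingpy import Wordlist
#     wl = from_cldf('cldf/Wordlist-metadata.json', to=Wordlist)
#     print(wl.height, wl.width)  # number of concepts / number of doculects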