def test_integration():
    tg = TableGroup.from_file(FIXTURES / 'csv.txt-metadata.json')
    orig = tg.read()
    db = Database(tg)
    db.write_from_tg()
    for table, items in db.read().items():
        assert items == orig[table]

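# A minimal, self-contained sketch of the roundtrip exercised by
# test_integration above: load a CSVW TableGroup, dump it into SQLite via
# csvw.db.Database, and read it back. The metadata path and the helper name
# `roundtrip` are hypothetical; the API calls are the ones used in the
# surrounding snippets.
import pathlib

from csvw import TableGroup
from csvw.db import Database


def roundtrip(md_path):
    tg = TableGroup.from_file(md_path)
    orig = tg.read()  # {table url: list of row dicts}
    db = Database(tg, fname=pathlib.Path('roundtrip.sqlite'))
    db.write_from_tg()
    # Reading from the database should reproduce the original tables:
    for table, items in db.read().items():
        assert items == orig[table]
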
def write_languoids_table(self, outdir, version=None):
    version = version or self.describe()
    if outdir is not None and not outdir.exists():
        raise IOError(
            "Specified output directory %s does not exist. Please create it." % outdir)
    out = outdir / 'glottolog-languoids-{0}.csv'.format(version)
    md = outdir / (out.name + '-metadata.json')
    tg = TableGroup.fromvalue({
        "@context": "http://www.w3.org/ns/csvw",
        "dc:version": version,
        "dc:bibliographicCitation":
            "Harald Hammarström, Robert Forkel & Martin Haspelmath. "
            "clld/glottolog: Glottolog database (Version {0}) [Data set]. "
            "Zenodo. http://doi.org/10.5281/zenodo.596479".format(version),
        "tables": [load(pycldf.util.pkg_path('components', 'LanguageTable-metadata.json'))],
    })
    tg.tables[0].url = out.name
    for col in [
        dict(name='LL_Code'),
        dict(name='Classification', separator='/'),
        dict(name='Family_Glottocode'),
        dict(name='Family_Name'),
        dict(name='Language_Glottocode'),
        dict(name='Language_Name'),
        dict(name='Level', datatype=dict(base='string', format='family|language|dialect')),
        dict(name='Status'),
    ]:
        tg.tables[0].tableSchema.columns.append(Column.fromvalue(col))

    langs = []
    for lang in self.languoids():
        lid, lname = None, None
        if lang.level == self.languoid_levels.language:
            lid, lname = lang.id, lang.name
        elif lang.level == self.languoid_levels.dialect:
            for lname, lid, level in reversed(lang.lineage):
                if level == self.languoid_levels.language:
                    break
            else:  # pragma: no cover
                raise ValueError
        langs.append(dict(
            ID=lang.id,
            Name=lang.name,
            Macroarea=lang.macroareas[0].name if lang.macroareas else None,
            Latitude=lang.latitude,
            Longitude=lang.longitude,
            Glottocode=lang.id,
            ISO639P3code=lang.iso,
            LL_Code=lang.identifier.get('multitree'),
            Classification=[c[1] for c in lang.lineage],
            Language_Glottocode=lid,
            Language_Name=lname,
            Family_Name=lang.lineage[0][0] if lang.lineage else None,
            Family_Glottocode=lang.lineage[0][1] if lang.lineage else None,
            Level=lang.level.name,
            Status=lang.endangerment.status.name if lang.endangerment else None,
        ))

    tg.to_file(md)
    tg.tables[0].write(langs, fname=out)
    return md, out

def tg():
    return TableGroup.fromvalue(
        {'tables': [{
            'url': 'data',
            'tableSchema': {
                'columns': []
            }
        }]})

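# Sketch of how the minimal TableGroup built by tg() above can grow a schema
# and serialize rows in memory. Column.fromvalue and write(..., fname=None)
# are used in the surrounding snippets; passing fname=None returns the CSV as
# bytes rather than writing a file. The expected output assumes csvw's
# default dialect (header row); illustrative only.
from csvw import Column


def test_minimal_write():
    group = tg()  # reuse the factory above
    group.tables[0].tableSchema.columns.append(Column.fromvalue({'name': 'ID'}))
    out = group.tables[0].write([{'ID': '1'}, {'ID': '2'}], fname=None)
    assert out.decode('utf8').strip().splitlines() == ['ID', '1', '2']
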
def test_write_file_exists(tmpdir):
    target = pathlib.Path(str(tmpdir / 'db.sqlite3'))
    target.touch(exist_ok=False)
    mtime = target.stat().st_mtime
    tg = TableGroup.from_file(FIXTURES / 'csv.txt-metadata.json')
    db = Database(tg, fname=target)
    with pytest.raises(ValueError, match=r'already exists'):
        db.write()
    time.sleep(0.1)
    db.write(force=True)
    assert target.stat().st_mtime > mtime

def from_file(cls, fname, form=None):
    """
    Read an orthography profile from a metadata file or a default tab-separated profile file.
    """
    try:
        tg = TableGroup.from_file(fname)
        opfname = None
    except JSONDecodeError:
        tg = TableGroup.fromvalue(cls.MD)
        opfname = fname
    if len(tg.tables) != 1:
        raise ValueError('profile description must contain exactly one table')
    metadata = tg.common_props
    metadata.update(fname=Path(fname), form=form)
    return cls(
        *[{
            k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
            for k, v in d.items()}
          for d in tg.tables[0].iterdicts(fname=opfname)],
        **metadata)

def __str__(self):
    """
    A Profile is represented as tab-separated lines of grapheme specifications.
    """
    tg = TableGroup.fromvalue(self.MD)
    for col in self.column_labels:
        if col != self.GRAPHEME_COL:
            tg.tables[0].tableSchema.columns.append(
                Column.fromvalue({"name": col, "null": self.NULL}))
    return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()

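# Taken together, from_file and __str__ above suggest a serialization
# roundtrip for orthography profiles. A hedged sketch, assuming the segments
# library and a tab-separated profile at the hypothetical path 'my.prf':
from pathlib import Path

from segments import Profile

profile = Profile.from_file(Path('my.prf'))
# __str__ re-serializes the profile as tab-separated grapheme lines, so a
# parse/serialize cycle should be lossless up to NULL handling:
print(str(profile))
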
def test_extra_columns(tmpdir):
    tmpdir.join('md.json').write_text("""\
{
    "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
    "dialect": {"header": true, "encoding": "utf-8-sig"},
    "tables": [
        {"url": "csv.txt", "tableSchema": {"columns": [{"name": "ID", "datatype": "string"}]}}
    ]
}
""", encoding='utf8')
    tmpdir.join('csv.txt').write_text('ID,extra\n1,ex', encoding='utf8')
    tg = TableGroup.from_file(str(tmpdir.join('md.json')))
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        db = Database(tg, fname=str(tmpdir.join('test.sqlite')))
        with pytest.raises(ValueError):
            db.write_from_tg()
        db.write_from_tg(_force=True, _skip_extra=True)

def write_languoids_table(self, outdir, version=None):
    version = version or self.describe()
    out = outdir / 'glottolog-languoids-{0}.csv'.format(version)
    md = outdir / (out.name + '-metadata.json')
    tg = TableGroup.fromvalue({
        "@context": "http://www.w3.org/ns/csvw",
        "dc:version": version,
        "dc:bibliographicCitation":
            "{0}. "
            "{1} [Data set]. "
            "Zenodo. https://doi.org/{2}".format(
                ' & '.join([e.name for e in self.current_editors]),
                self.publication.zenodo.title_format.format('(Version {0})'.format(version)),
                self.publication.zenodo.doi,
            ),
        "tables": [
            load(pycldf.util.pkg_path('components', 'LanguageTable-metadata.json'))
        ],
    })
    tg.tables[0].url = out.name
    for col in [
        dict(name='LL_Code'),
        dict(name='Classification', separator='/'),
        dict(name='Family_Glottocode'),
        dict(name='Family_Name'),
        dict(name='Language_Glottocode'),
        dict(name='Language_Name'),
        dict(name='Level', datatype=dict(base='string', format='family|language|dialect')),
        dict(name='Status'),
    ]:
        tg.tables[0].tableSchema.columns.append(Column.fromvalue(col))

    langs = []
    for lang in self.languoids():
        lid, lname = None, None
        if lang.level == self.languoid_levels.language:
            lid, lname = lang.id, lang.name
        elif lang.level == self.languoid_levels.dialect:
            for lname, lid, level in reversed(lang.lineage):
                if level == self.languoid_levels.language:
                    break
            else:  # pragma: no cover
                raise ValueError
        langs.append(dict(
            ID=lang.id,
            Name=lang.name,
            Macroarea=lang.macroareas[0].name if lang.macroareas else None,
            Latitude=lang.latitude,
            Longitude=lang.longitude,
            Glottocode=lang.id,
            ISO639P3code=lang.iso,
            LL_Code=lang.identifier.get('multitree'),
            Classification=[c[1] for c in lang.lineage],
            Language_Glottocode=lid,
            Language_Name=lname,
            Family_Name=lang.lineage[0][0] if lang.lineage else None,
            Family_Glottocode=lang.lineage[0][1] if lang.lineage else None,
            Level=lang.level.name,
            Status=lang.endangerment.status.name if lang.endangerment else None,
        ))

    tg.to_file(md)
    tg.tables[0].write(langs, fname=out)
    return md, out

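# Hedged usage sketch for write_languoids_table, assuming a pyglottolog
# Glottolog instance pointing at a local clone of the glottolog data
# repository (the repository path and version string below are hypothetical):
import pathlib

from pyglottolog import Glottolog

glottolog = Glottolog('path/to/glottolog')
outdir = pathlib.Path('dist')
outdir.mkdir(exist_ok=True)
md_path, csv_path = glottolog.write_languoids_table(outdir, version='4.0')
# md_path points at the CSVW metadata, csv_path at the languoid table.
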
def cmd_makecldf(self, args):
    def parameter_id(s):
        return slug(s, lowercase=False)

    lcols = []
    for spec in LDATA.values():
        if isinstance(spec, tuple):
            col, dtype = spec
        else:
            col, dtype = spec, 'string'
        if dtype:
            lcols.append(dict(name=col, datatype=dtype))
    args.writer.cldf.add_component('LanguageTable', *lcols)
    args.writer.cldf.add_component(
        'ParameterTable',
        {'name': 'datatype', 'datatype': {'base': 'string', 'format': 'boolean|number|integer'}},
        'min',
        'max')
    args.writer.cldf.remove_columns('ValueTable', 'Comment', 'Source')

    iso2gc = {l.iso: l.id for l in args.glottolog.api.languoids() if l.iso}
    dt_map = {
        r['Parameter_ID']: r['datatype']
        for r in self.etc_dir.read_csv('parameters.csv', dicts=True)}
    tg = TableGroup.from_file(self.raw_dir / 'metadata.json')
    colmap = {}
    seen = set()
    for col in tg.tables[0].tableSchema.columns:
        if col.header not in LDATA:
            pid = parameter_id(col.header)
            if pid not in seen:
                col.datatype.base = dt_map.get(pid, col.datatype.base)
                colmap[pid] = col.datatype
                kw = {
                    'ID': pid,
                    'Name': col.header,
                    'Description': col.common_props['dc:description'].strip() or None,
                    'datatype': col.datatype.base,
                }
                if col.datatype.minimum is not None:
                    kw['min'] = col.datatype.minimum
                    kw['max'] = col.datatype.maximum
                args.writer.objects['ParameterTable'].append(kw)
                seen.add(pid)

    gc_map = {
        r['ID']: r['Glottocode']
        for r in self.etc_dir.read_csv('languages.csv', dicts=True)}
    country_map = {
        r['country']: r['alpha_2']
        for r in self.etc_dir.read_csv('countries.csv', dicts=True)}
    vals = {}
    for i, row in enumerate(tg.tables[0]):
        lid = row['ID']
        m = re.search('(?P<code>[a-z]{3})(-[0-9]+)?', row['ISO code'] or '')
        row['ISO code'] = m.group('code') if m else None
        kw = {'Glottocode': gc_map.get(row['ID'], iso2gc.get(row['ISO code']))}
        for k, v in LDATA.items():
            kw[v[0] if isinstance(v, tuple) else v] = row.pop(k)
        kw['Macroarea'] = 'Papunesia' if kw['Macroarea'] == 'Pacific' else kw['Macroarea']
        if kw['Country']:
            if kw['Country'] in country_map:
                kw['Country'] = country_map[kw['Country']] or None
            else:
                kw['Country'] = pycountry.countries.lookup(kw['Country']).alpha_2
        args.writer.objects['LanguageTable'].append(kw)
        for col, val in row.items():
            if val is None:
                continue
            pid = parameter_id(col)
            dtype = colmap[pid]
            if dtype.base in ('integer', 'number'):
                assert dtype.minimum <= val <= dtype.maximum
            elif dtype.base == 'boolean':
                val = {True: 'yes', False: 'no'}[val]
            if (lid, pid) in vals:
                assert vals[lid, pid] == val
                continue
            vals[lid, pid] = val
            if col not in LDATA:
                args.writer.objects['ValueTable'].append({
                    'ID': '{0}-{1}'.format(kw['ID'], pid),
                    'Language_ID': kw['ID'],
                    'Parameter_ID': pid,
                    'Value': val,
                })

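# parameter_id above derives ParameterTable IDs from raw column headers via
# slug (clldutils.misc.slug in cldfbench datasets), keeping case but dropping
# whitespace and punctuation. A small illustration; the header strings are
# made up:
from clldutils.misc import slug

assert slug('Vowel inventory size', lowercase=False) == 'Vowelinventorysize'
assert slug('ISO code', lowercase=False) == 'ISOcode'
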
def __init__(self, id_):
    """
    :param id_: The name of a transcription system or a directory containing one.
    """
    if hasattr(self, 'features'):
        # Only initialize, if this is really a new instance!
        return
    assert id_
    system = pkg_path('transcriptionsystems', id_)
    if not (system.exists() and system.is_dir()):
        raise ValueError('unknown system: {0}'.format(id_))

    self.system = TableGroup.from_file(
        pkg_path('transcriptionsystems', 'transcription-system-metadata.json'))
    self.system._fname = system / 'metadata.json'

    self.features = {'consonant': {}, 'vowel': {}, 'tone': {}}
    # dictionary for feature values, checks when writing elements from
    # write_order to make sure no output is doubled
    self._feature_values = {}

    # load the general features
    features = jsonlib.load(pkg_path('transcriptionsystems', 'features.json'))

    self.diacritics = dict(
        consonant={}, vowel={}, click={}, diphthong={}, tone={}, cluster={})
    for dia in itertable(self.system.tabledict['diacritics.tsv']):
        if not dia['alias'] and not dia['typography']:
            self.features[dia['type']][dia['value']] = dia['grapheme']
        # assign feature values to the dictionary
        self._feature_values[dia['value']] = dia['feature']
        self.diacritics[dia['type']][dia['grapheme']] = dia['value']

    self.sound_classes = {}
    self.columns = {}  # the basic column structure, to allow for rendering
    self.sounds = {}  # Sounds by grapheme
    self._covered = {}
    # check for unresolved aliased sounds
    aliases = []
    for cls in [Consonant, Vowel, Tone, Marker]:  # noqa: F405
        type_ = cls.__name__.lower()
        self.sound_classes[type_] = cls
        # store information on column structure to allow for rendering of a
        # sound in this form, which will make it easier to insert it when
        # finding generated sounds
        self.columns[type_] = [
            c['name'].lower() for c in self.system.tabledict[
                '{0}s.tsv'.format(type_)].asdict()['tableSchema']['columns']]
        for l, item in enumerate(itertable(self.system.tabledict['{0}s.tsv'.format(type_)])):
            if item['grapheme'] in self.sounds:
                raise ValueError('duplicate grapheme in {0}:{1}: {2}'.format(
                    type_ + 's.tsv', l + 2, item['grapheme']))
            sound = cls(ts=self, **item)
            # make sure this does not take too long
            for key, value in item.items():
                if key not in {'grapheme', 'note', 'alias'} and \
                        value and value not in self._feature_values:
                    self._feature_values[value] = key
                    if type_ != 'marker' and value not in features[type_][key]:
                        raise ValueError(
                            "Unrecognized features ({0}: {1}, line {2})".format(
                                key, value, l + 2))
            self.sounds[item['grapheme']] = sound
            if not sound.alias:
                if sound.featureset in self.features:
                    raise ValueError('duplicate features in {0}:{1}: {2}'.format(
                        type_ + 's.tsv', l + 2, sound.name))
                self.features[sound.featureset] = sound
            else:
                aliases += [(l, sound.type, sound.featureset)]

    # check for consistency of aliases: if an alias has no counterpart, it
    # is orphaned and needs to be deleted or given an accepted non-aliased
    # sound
    if [x for x in aliases if x[2] not in self.features]:  # pragma: no cover
        error = ', '.join(
            text_type(x[0] + 2) + '/' + text_type(x[1])
            for x in aliases if x[2] not in self.features)
        raise ValueError('Orphaned aliases in line(s) {0}'.format(error))

    # basic regular expression, used to match the basic sounds in the system.
    self._regex = None
    self._update_regex()

    # normalization data
    self._normalize = {
        norm(r['source']): norm(r['target'])
        for r in itertable(self.system.tabledict['normalize.tsv'])}

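# Hedged usage sketch for the TranscriptionSystem initializer above, assuming
# an older pyclts layout where transcription systems ship as package data and
# can be addressed by name (e.g. 'bipa'):
from pyclts import TranscriptionSystem

bipa = TranscriptionSystem('bipa')
sound = bipa['t']  # look up a sound by grapheme
print(sound.name)  # e.g. 'voiceless alveolar stop consonant'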