Example 1
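An integration test: a TableGroup loaded from CSVW metadata is written to a SQLite Database and read back, asserting that the round trip preserves each table's rows.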
def test_integration():
    tg = TableGroup.from_file(FIXTURES / 'csv.txt-metadata.json')
    orig = tg.read()
    db = Database(tg)
    db.write_from_tg()
    for table, items in db.read().items():
        assert items == orig[table]
Example 2
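A method (from the Glottolog tooling, judging by the citation it embeds) that serializes all languoids to a CSVW dataset: it instantiates the CLDF LanguageTable component, appends custom columns to its schema, and writes both the metadata file and the data file.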
    def write_languoids_table(self, outdir, version=None):
        version = version or self.describe()
        if outdir is not None and not outdir.exists():
            raise IOError("Specified output directory %s does not exist. Please create it." % outdir)
        out = outdir / 'glottolog-languoids-{0}.csv'.format(version)
        md = outdir / (out.name + '-metadata.json')
        tg = TableGroup.fromvalue({
            "@context": "http://www.w3.org/ns/csvw",
            "dc:version": version,
            "dc:": "Harald Hammarström, Robert Forkel & Martin Haspelmath. "
                   "clld/glottolog: Glottolog database (Version {0}) [Data set]. "
                   "Zenodo. http://doi.org/10.5281/zenodo.596479".format(version),
            "tables": [load(pycldf.util.pkg_path('components', 'LanguageTable-metadata.json'))],
        })
        tg.tables[0].url = out.name
        for col in [
            dict(name='LL_Code'),
            dict(name='Classification', separator='/'),
            dict(name='Family_Glottocode'),
            dict(name='Family_Name'),
            dict(name='Language_Glottocode'),
            dict(name='Language_Name'),
            dict(name='Level', datatype=dict(base='string', format='family|language|dialect')),
            dict(name='Status'),
        ]:
            tg.tables[0].tableSchema.columns.append(Column.fromvalue(col))

        langs = []
        for lang in self.languoids():
            lid, lname = None, None
            if lang.level == self.languoid_levels.language:
                lid, lname = lang.id, lang.name
            elif lang.level == self.languoid_levels.dialect:
                for lname, lid, level in reversed(lang.lineage):
                    if level == self.languoid_levels.language:
                        break
                else:  # pragma: no cover
                    raise ValueError
            langs.append(dict(
                ID=lang.id,
                Name=lang.name,
                Macroarea=lang.macroareas[0].name if lang.macroareas else None,
                Latitude=lang.latitude,
                Longitude=lang.longitude,
                Glottocode=lang.id,
                ISO639P3code=lang.iso,
                LL_Code=lang.identifier.get('multitree'),
                Classification=[c[1] for c in lang.lineage],
                Language_Glottocode=lid,
                Language_Name=lname,
                Family_Name=lang.lineage[0][0] if lang.lineage else None,
                Family_Glottocode=lang.lineage[0][1] if lang.lineage else None,
                Level=lang.level.name,
                Status=lang.endangerment.status.name if lang.endangerment else None,
            ))

        tg.to_file(md)
        tg.tables[0].write(langs, fname=out)
        return md, out
Example 3
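A minimal test fixture: a TableGroup with a single table and an empty column schema.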
def tg():
    return TableGroup.fromvalue(
        {'tables': [{
            'url': 'data',
            'tableSchema': {
                'columns': []
            }
        }]})
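Such an empty schema is typically filled programmatically. A minimal runnable sketch, assuming csvw exports TableGroup and Column at the package top level (as the snippets here imply):

from csvw import TableGroup, Column

tg = TableGroup.fromvalue(
    {'tables': [{'url': 'data.csv', 'tableSchema': {'columns': []}}]})
# Columns can be appended to the schema after construction,
# as in the examples above:
tg.tables[0].tableSchema.columns.append(
    Column.fromvalue({'name': 'ID', 'datatype': 'string'}))
# With fname=None, write() returns the serialized table as bytes
# (the same idiom as in the Profile.__str__ example below):
print(tg.tables[0].write([dict(ID='1')], fname=None).decode('utf8'))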
Example 4
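A test that Database.write refuses to overwrite an existing SQLite file, raising ValueError unless force=True is passed.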
def test_write_file_exists(tmpdir):
    target = pathlib.Path(str(tmpdir / 'db.sqlite3'))
    target.touch(exist_ok=False)
    mtime = target.stat().st_mtime
    tg = TableGroup.from_file(FIXTURES / 'csv.txt-metadata.json')
    db = Database(tg, fname=target)
    with pytest.raises(ValueError, match=r'already exists'):
        db.write()
    time.sleep(0.1)
    db.write(force=True)
    assert target.stat().st_mtime > mtime
Example 5
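An alternate constructor for orthography profiles: it first tries to read fname as CSVW metadata and, if that fails to parse as JSON, falls back to the default metadata in cls.MD and treats fname as a plain tab-separated profile file.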
    @classmethod
    def from_file(cls, fname, form=None):
        """
        Read an orthography profile from a metadata file or a default tab-separated profile file.
        """
        try:
            tg = TableGroup.from_file(fname)
            opfname = None
        except JSONDecodeError:
            tg = TableGroup.fromvalue(cls.MD)
            opfname = fname
        if len(tg.tables) != 1:
            raise ValueError(
                'profile description must contain exactly one table')
        metadata = tg.common_props
        metadata.update(fname=Path(fname), form=form)
        return cls(
            *[{
                k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
                for k, v in d.items()
            } for d in tg.tables[0].iterdicts(fname=opfname)], **metadata)
Example 6
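The matching serialization: a profile renders itself as tab-separated text by building a TableGroup from the default metadata and writing with fname=None, which returns the content as bytes rather than writing a file.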
    def __str__(self):
        """
        A Profile is represented as tab-separated lines of grapheme specifications.
        """
        tg = TableGroup.fromvalue(self.MD)
        for col in self.column_labels:
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({
                        "name": col,
                        "null": self.NULL
                    }))

        return tg.tables[0].write(self.iteritems(),
                                  fname=None).decode('utf8').strip()
Example 7
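A test of schema/data mismatches: the data file has a column not declared in the table schema, so write_from_tg raises ValueError unless _skip_extra=True is passed.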
def test_extra_columns(tmpdir):
    tmpdir.join('md.json').write_text("""{
    "@context": ["http://www.w3.org/ns/csvw",{"@language": "en"}],
    "dialect": {"header": true,"encoding": "utf-8-sig"},
    "tables": [
        {"url": "csv.txt","tableSchema": {"columns": [{"name": "ID", "datatype": "string"}]}}
    ]
}
""",
                                      encoding='utf8')
    tmpdir.join('csv.txt').write_text('ID,extra\n1,ex', encoding='utf8')
    tg = TableGroup.from_file(str(tmpdir.join('md.json')))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        db = Database(tg, fname=str(tmpdir.join('test.sqlite')))
        with pytest.raises(ValueError):
            db.write_from_tg()
        db.write_from_tg(_force=True, _skip_extra=True)
Example 8
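A later revision of the method from Example 2: the bibliographic citation is now assembled from the repository's editor and publication configuration instead of being hard-coded.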
    def write_languoids_table(self, outdir, version=None):
        version = version or self.describe()
        out = outdir / 'glottolog-languoids-{0}.csv'.format(version)
        md = outdir / (out.name + '-metadata.json')
        tg = TableGroup.fromvalue({
            "@context": "http://www.w3.org/ns/csvw",
            "dc:version": version,
            "dc:bibliographicCitation":
                "{0}. "
                "{1} [Data set]. "
                "Zenodo. https://doi.org/{2}".format(
                    ' & '.join([e.name for e in self.current_editors]),
                    self.publication.zenodo.title_format.format(
                        '(Version {0})'.format(version)),
                    self.publication.zenodo.doi,
                ),
            "tables": [
                load(pycldf.util.pkg_path('components', 'LanguageTable-metadata.json')),
            ],
        })
        tg.tables[0].url = out.name
        for col in [
            dict(name='LL_Code'),
            dict(name='Classification', separator='/'),
            dict(name='Family_Glottocode'),
            dict(name='Family_Name'),
            dict(name='Language_Glottocode'),
            dict(name='Language_Name'),
            dict(name='Level', datatype=dict(base='string', format='family|language|dialect')),
            dict(name='Status'),
        ]:
            tg.tables[0].tableSchema.columns.append(Column.fromvalue(col))

        langs = []
        for lang in self.languoids():
            lid, lname = None, None
            if lang.level == self.languoid_levels.language:
                lid, lname = lang.id, lang.name
            elif lang.level == self.languoid_levels.dialect:
                for lname, lid, level in reversed(lang.lineage):
                    if level == self.languoid_levels.language:
                        break
                else:  # pragma: no cover
                    raise ValueError
            langs.append(dict(
                ID=lang.id,
                Name=lang.name,
                Macroarea=lang.macroareas[0].name if lang.macroareas else None,
                Latitude=lang.latitude,
                Longitude=lang.longitude,
                Glottocode=lang.id,
                ISO639P3code=lang.iso,
                LL_Code=lang.identifier.get('multitree'),
                Classification=[c[1] for c in lang.lineage],
                Language_Glottocode=lid,
                Language_Name=lname,
                Family_Name=lang.lineage[0][0] if lang.lineage else None,
                Family_Glottocode=lang.lineage[0][1] if lang.lineage else None,
                Level=lang.level.name,
                Status=lang.endangerment.status.name if lang.endangerment else None,
            ))

        tg.to_file(md)
        tg.tables[0].write(langs, fname=out)
        return md, out
Example 9
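A cldfbench-style makecldf command: it reads a raw CSVW table, derives ParameterTable rows from the column metadata, and maps each data row to LanguageTable and ValueTable entries.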
    def cmd_makecldf(self, args):
        def parameter_id(s):
            return slug(s, lowercase=False)

        lcols = []
        for spec in LDATA.values():
            if isinstance(spec, tuple):
                col, dtype = spec
            else:
                col, dtype = spec, 'string'
            if dtype:
                lcols.append(dict(name=col, datatype=dtype))
        args.writer.cldf.add_component('LanguageTable', *lcols)
        args.writer.cldf.add_component(
            'ParameterTable',
            {
                'name': 'datatype',
                'datatype': {'base': 'string', 'format': 'boolean|number|integer'},
            },
            'min', 'max')
        args.writer.cldf.remove_columns('ValueTable', 'Comment', 'Source')

        iso2gc = {l.iso: l.id for l in args.glottolog.api.languoids() if l.iso}
        dt_map = {
            r['Parameter_ID']: r['datatype']
            for r in self.etc_dir.read_csv('parameters.csv', dicts=True)
        }

        tg = TableGroup.from_file(self.raw_dir / 'metadata.json')
        colmap = {}
        seen = set()
        for col in tg.tables[0].tableSchema.columns:
            if col.header not in LDATA:
                pid = parameter_id(col.header)
                if pid not in seen:
                    col.datatype.base = dt_map.get(pid, col.datatype.base)
                    colmap[pid] = col.datatype
                    kw = {
                        'ID': pid,
                        'Name': col.header,
                        'Description': col.common_props['dc:description'].strip() or None,
                        'datatype': col.datatype.base,
                    }
                    if col.datatype.minimum is not None:
                        kw['min'] = col.datatype.minimum
                        kw['max'] = col.datatype.maximum
                    args.writer.objects['ParameterTable'].append(kw)
                    seen.add(pid)

        gc_map = {
            r['ID']: r['Glottocode']
            for r in self.etc_dir.read_csv('languages.csv', dicts=True)
        }
        country_map = {
            r['country']: r['alpha_2']
            for r in self.etc_dir.read_csv('countries.csv', dicts=True)
        }
        vals = {}
        for i, row in enumerate(tg.tables[0]):
            lid = row['ID']
            m = re.search('(?P<code>[a-z]{3})(-[0-9]+)?', row['ISO code'] or '')
            row['ISO code'] = m.group('code') if m else None
            kw = {'Glottocode': gc_map.get(row['ID'], iso2gc.get(row['ISO code']))}
            for k, v in LDATA.items():
                kw[v[0] if isinstance(v, tuple) else v] = row.pop(k)
            if kw['Macroarea'] == 'Pacific':
                kw['Macroarea'] = 'Papunesia'
            if kw['Country']:
                if kw['Country'] in country_map:
                    kw['Country'] = country_map[kw['Country']] or None
                else:
                    kw['Country'] = pycountry.countries.lookup(
                        kw['Country']).alpha_2
            args.writer.objects['LanguageTable'].append(kw)

            for col, val in row.items():
                if val is None:
                    continue
                pid = parameter_id(col)
                dtype = colmap[pid]
                if dtype.base in ('integer', 'number'):
                    assert dtype.minimum <= val <= dtype.maximum
                elif dtype.base == 'boolean':
                    val = {True: 'yes', False: 'no'}[val]
                if (lid, pid) in vals:
                    assert vals[lid, pid] == val
                    continue
                vals[lid, pid] = val
                if col not in LDATA:
                    args.writer.objects['ValueTable'].append({
                        'ID': '{0}-{1}'.format(kw['ID'], pid),
                        'Language_ID': kw['ID'],
                        'Parameter_ID': pid,
                        'Value': val,
                    })
Example 10
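A transcription-system initializer: the system's CSVW description is loaded from package data, and its TSV tables (diacritics, one table of sounds per sound class, normalization mappings) are iterated to populate the feature and sound registries.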
    def __init__(self, id_):
        """
        :param id_: The name of a transcription system or a directory containing one.
        """
        if hasattr(self, 'features'):
            # Only initialize, if this is really a new instance!
            return
        assert id_
        system = pkg_path('transcriptionsystems', id_)
        if not (system.exists() and system.is_dir()):
            raise ValueError('unknown system: {0}'.format(id_))

        self.system = TableGroup.from_file(
            pkg_path('transcriptionsystems',
                     'transcription-system-metadata.json'))
        self.system._fname = system / 'metadata.json'

        self.features = {'consonant': {}, 'vowel': {}, 'tone': {}}
        # dictionary for feature values, checks when writing elements from
        # write_order to make sure no output is doubled
        self._feature_values = {}

        # load the general features
        features = jsonlib.load(
            pkg_path('transcriptionsystems', 'features.json'))

        self.diacritics = dict(
            consonant={}, vowel={}, click={}, diphthong={}, tone={}, cluster={})
        for dia in itertable(self.system.tabledict['diacritics.tsv']):
            if not dia['alias'] and not dia['typography']:
                self.features[dia['type']][dia['value']] = dia['grapheme']
            # assign feature values to the dictionary
            self._feature_values[dia['value']] = dia['feature']
            self.diacritics[dia['type']][dia['grapheme']] = dia['value']

        self.sound_classes = {}
        self.columns = {}  # the basic column structure, to allow for rendering
        self.sounds = {}  # Sounds by grapheme
        self._covered = {}
        # check for unresolved aliased sounds
        aliases = []
        for cls in [Consonant, Vowel, Tone, Marker]:  # noqa: F405
            type_ = cls.__name__.lower()
            self.sound_classes[type_] = cls
            # store information on column structure to allow for rendering of a
            # sound in this form, which will make it easier to insert it when
            # finding generated sounds
            self.columns[type_] = [
                c['name'].lower()
                for c in self.system.tabledict['{0}s.tsv'.format(
                    type_)].asdict()['tableSchema']['columns']
            ]
            for l, item in enumerate(
                    itertable(self.system.tabledict['{0}s.tsv'.format(type_)])):
                if item['grapheme'] in self.sounds:
                    raise ValueError(
                        'duplicate grapheme in {0}:{1}: {2}'.format(
                            type_ + 's.tsv', l + 2, item['grapheme']))
                sound = cls(ts=self, **item)
                # register this sound's feature values and validate them
                # against the known feature inventory
                for key, value in item.items():
                    if (key not in {'grapheme', 'note', 'alias'}
                            and value and value not in self._feature_values):
                        self._feature_values[value] = key
                        if type_ != 'marker' and value not in features[type_][key]:
                            raise ValueError(
                                'Unrecognized features ({0}: {1}, line {2})'.format(
                                    key, value, l + 2))

                self.sounds[item['grapheme']] = sound
                if not sound.alias:
                    if sound.featureset in self.features:
                        raise ValueError(
                            'duplicate features in {0}:{1}: {2}'.format(
                                type_ + 's.tsv', l + 2, sound.name))
                    self.features[sound.featureset] = sound
                else:
                    aliases += [(l, sound.type, sound.featureset)]
        # check for consistency of aliases: if an alias has no counterpart, it
        # is orphaned and needs to be deleted or given an accepted non-aliased
        # sound
        if [x for x in aliases if x[2] not in self.features]:  # pragma: no cover
            error = ', '.join(
                text_type(x[0] + 2) + '/' + text_type(x[1])
                for x in aliases if x[2] not in self.features)
            raise ValueError('Orphaned aliases in line(s) {0}'.format(error))

        # basic regular expression, used to match the basic sounds in the system.
        self._regex = None
        self._update_regex()

        # normalization data
        self._normalize = {
            norm(r['source']): norm(r['target'])
            for r in itertable(self.system.tabledict['normalize.tsv'])
        }