Example #1
    def test_BibFile(self):
        bibfile = self.api.bibfiles['a.bib']
        self.assertEqual(bibfile['a:key'].type, 'misc')
        self.assertEqual(bibfile['s:Andalusi:Turk'].key, 's:Andalusi:Turk')

        for entry in bibfile.iterentries():
            if entry.key == 'key':
                self.assertEqual(len(list(entry.languoids({'abc': 1})[0])), 1)

        with self.assertRaises(KeyError):
            bibfile['xyz']

        self.assertEqual(len(list(bibfile.iterentries())), 3)

        lines = [line for line in read_text(bibfile.fname).split('\n')
                 if not line.strip().startswith('glottolog_ref_id')]
        write_text(self.tmp_path('a.bib'), '\n'.join(lines))
        bibfile.update(self.tmp_path('a.bib'))
        self.assertEqual(len(list(bibfile.iterentries())), 3)

        bibfile.update(self.api.bibfiles['b.bib'].fname)
        self.assertEqual(len(list(bibfile.iterentries())), 1)

        def visitor(entry):
            entry.fields['new_field'] = 'a'

        bibfile.visit(visitor=visitor)
        for entry in bibfile.iterentries():
            self.assertIn('new_field', entry.fields)

        bibfile.visit(visitor=lambda e: True)
        self.assertEqual(len(bibfile.keys()), 0)
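A quick note on the helpers these tests lean on: the setup step is a plain read-filter-write round trip with clldutils.path. A minimal sketch of that pattern, assuming write_text(path, text) returns the number of characters written (as the read/write test further down shows):

from clldutils.path import read_text, write_text

def strip_field(src, dest, field='glottolog_ref_id'):
    # Drop every line that starts with the unwanted field, keep the rest verbatim.
    lines = [line for line in read_text(src).split('\n')
             if not line.strip().startswith(field)]
    # write_text returns the number of characters written.
    return write_text(dest, '\n'.join(lines))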
Example #2
def app(args):  # pragma: no cover
    """
    Dumps Concepticon's contents for English, German, Chinese, French, Russian, Spanish, and Portuguese.

    Notes
    -----
    Data are by default dumped into a structured JSON file in html/data.js.

    Examples
    --------
    $ concepticon html
    """
    data = defaultdict(list)

    def key(g, l):
        return '{0}---{1}'.format(g, l)

    for lang in ['en', 'de', 'zh', 'fr', 'ru', 'es', 'pt']:
        for cidx, gloss in args.api._get_map_for_language(lang):
            g0, _, g1 = gloss.partition('///')
            csspec = (
                cidx,
                args.api.conceptsets[cidx].gloss,
                args.api.conceptsets[cidx].definition,
                args.api.conceptsets[cidx].ontological_category)
            data[key(g1, lang)].append(csspec)
            if lang == 'en':
                data[key(g0, lang)].append(csspec)
                data[key(g0.lower(), lang)].append(csspec)
    data['language'] = 'en'
    write_text(
        args.api.appdatadir.joinpath('data.js'),
        'var Concepticon = {0};\n'.format(json.dumps(data, indent=2)))
    args.log.info('app data recreated')
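The final write_text call is what turns the collected mapping into a static JavaScript file the app can load without any server-side code. A stripped-down sketch of that step; the file and variable names here are just placeholders:

import json
from clldutils.path import write_text

def dump_js_var(path, name, obj):
    # Serialize obj as 'var <name> = <json>;' so a <script src=...> tag can load it.
    write_text(path, 'var {0} = {1};\n'.format(name, json.dumps(obj, indent=2)))

# dump_js_var('data.js', 'Concepticon', {'language': 'en'})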
Example #3
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(str(tmpdir / 'values.csv'), "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()
    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
Example #4
def test_encoding(tmppath):
    ini = tmppath / 'test.ini'
    write_text(ini, '[äöü]\näöü = äöü', encoding='cp1252')

    with pytest.raises(UnicodeDecodeError):
        INI.from_file(ini)

    assert INI.from_file(ini, encoding='cp1252')['äöü']['äöü'] == 'äöü'
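Both helpers default to UTF-8, which is exactly what this test exercises: bytes written as cp1252 fail with the default reader and only decode once the matching encoding is passed. A minimal round-trip sketch, assuming the read_text/write_text signatures shown in these examples:

from clldutils.path import read_text, write_text

def roundtrip(path, text, encoding='utf8'):
    # Write and re-read with the same encoding; mixing encodings raises UnicodeDecodeError.
    write_text(path, text, encoding=encoding)
    return read_text(path, encoding=encoding) == text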
Example #5
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    while not id_pattern.match(md['id']):
        print(
            'dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!'
        )
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(
            pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                target = re.sub(r'\+([a-z]+)\+',
                                lambda m: '{' + m.groups()[0] + '}',
                                path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
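The template loop above renames files by turning each +name+ marker into a str.format field before substituting the collected metadata, and only strips the _tmpl suffix afterwards. A small standalone illustration of that renaming trick (the file name is made up):

import re

def expand_name(name, **md):
    # '+id+' -> '{id}', then fill in the metadata values.
    return re.sub(r'\+([a-z]+)\+', lambda m: '{' + m.group(1) + '}', name).format(**md)

# expand_name('lexibank_+id+.py_tmpl', id='mydataset') == 'lexibank_mydataset.py_tmpl'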
Example #6
def readme(args):
    md = ['# Sources', '']
    for datatype in ['datasets', 'phylogenies']:
        md.append('\n## {0}\n'.format(datatype.capitalize()))
        t = Table('Name', 'Reference')
        for obj in getattr(args.repos, datatype):
            if not obj.id.startswith(
                    'glottolog_') or obj.id == 'glottolog_global':
                t.append([
                    '[{0}]({1}/{2})'.format(obj.name, datatype, obj.id),
                    obj.reference
                ])
        md.append(t.render(condensed=False))
    write_text(args.repos.dir.joinpath('SOURCES.md'), '\n'.join(md))
Example #7
def dump(version, all_langs, identifiers):
    out = args.data_file('files', 'glottolog-{0}'.format(version))
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}

    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk),
                             lambda l: l.fpk)
    }

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'
            .format(lang.id)
        ]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)
        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)
        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[
                    all_langs[version][lid].link for lid in lang.replacements
                    if lid in all_langs[version]
                ],
                wrap=wrap,
                link_list=link_list,
            ))
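The ancestor chain is built by following fpk (the parent's primary key) upwards until a node has no known parent. A toy sketch of that walk with an invented parent mapping:

parent_of = {3: 2, 2: 1, 1: None}  # child pk -> parent pk (toy data)

def ancestors(pk):
    chain, fpk = [], parent_of.get(pk)
    while fpk:
        chain.append(fpk)
        fpk = parent_of.get(fpk)
    return chain

# ancestors(3) == [2, 1]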
Example #8
def newick(args):
    from pyglottolog.languoids import Level
    nodes = collections.OrderedDict((l.id, l) for l in args.repos.languoids())
    trees = []
    for lang in nodes.values():
        if not lang.lineage and not lang.category.startswith('Pseudo '):
            ns = lang.newick_node(nodes=nodes).newick
            if lang.level == Level.language and not ns.startswith('('):
                # an isolate without dialects: we wrap it in a pseudo-family with the
                # same name and ID.
                ns = '({0}){0}'.format(ns)
            trees.append('{0};'.format(ns))
    fname = args.pkg_dir.joinpath('static', 'download', 'tree-glottolog-newick.txt')
    write_text(fname, '\n'.join(trees))
    args.log.info('{0} written'.format(fname))
Example #9
def test_read_write(tmppath):
    from clldutils.path import read_text, write_text

    text = 'äöüß'
    p = tmppath / 'test'
    assert write_text(p, text) == len(text)
    assert read_text(p) == text
Example #10
def write_valid_soundfilepaths(args):
    """
    Creates the file 'valid_soundfilepaths.txt' containing all valid
    sound file paths based on database data.
    """
    db = _db(args)
    api = _api(args)
    # make sure all data will be concatenated
    db("SET @@group_concat_max_len = 4096")
    query = """
SELECT 
concat(L.FilePathPart,"/",L.FilePathPart, W.SoundFileWordIdentifierText) as P
FROM Words AS W, Languages AS L
WHERE 
L.study <> 'Europe' AND W.study <> 'Europe' AND
L.study = W.study
UNION
SELECT
concat(
L.FilePathPart,"/",L.FilePathPart,
W.SoundFileWordIdentifierText,
case
	when T.AlternativeLexemIx > 1 and T.AlternativePhoneticRealisationIx = 0 then concat("_lex", T.AlternativeLexemIx)
	when T.AlternativeLexemIx = 0 and T.AlternativePhoneticRealisationIx > 1 then concat("_pron", T.AlternativePhoneticRealisationIx)
	when T.AlternativeLexemIx > 1 and T.AlternativePhoneticRealisationIx > 1 then concat("_lex", T.AlternativeLexemIx,"_pron", T.AlternativePhoneticRealisationIx)
	else ""
end
) as P
FROM Transcriptions AS T, Words AS W, Languages AS L
WHERE
L.study <> 'Europe' AND T.study <> 'Europe' AND W.study <> 'Europe' AND
L.`LanguageIx` = T.`LanguageIx`
AND
W.`IxElicitation` = T.`IxElicitation`
AND
W.IxMorphologicalInstance = T.IxMorphologicalInstance
AND
L.study = W.study
ORDER BY 1 ASC
    """
    data = list(db(query))
    valid_snd_file_names = set()
    for row in data:
        valid_snd_file_names.add(row['P'])
    write_text(api.repos / 'soundfiles' / 'valid_soundfilepaths.txt',
        '\n'.join(sorted(valid_snd_file_names, key=lambda s: s.lower())))
Example #11
def read_url(path, cache_dir=None, log=None):
    """
    Delegate scraping to clldutils, since nowadays this requires tweaking the user agent as well.
    """
    if cache_dir:
        cache_dir = Path(cache_dir)
        if log:  # pragma: no cover
            log.debug('retrieving {0} ...'.format(path))
        fpath = cache_dir / hashlib.md5(path.encode('utf8')).hexdigest()
        if not fpath.exists():
            with iso_639_3._open(path) as fp:
                write_text(fpath, fp.read().decode('utf8'))
        else:  # pragma: no cover
            if log:
                log.debug('... from cache {0}'.format(fpath))
        return read_text(fpath)

    with iso_639_3._open(path) as fp:
        return fp.read().decode('utf8')
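The cache key is simply the MD5 digest of the URL, so repeated requests for the same path resolve to the same file under cache_dir. A sketch of just that lookup, with a generic fetch callable standing in for iso_639_3._open:

import hashlib
from clldutils.path import Path, read_text, write_text

def cached_fetch(url, cache_dir, fetch):
    # One cache file per URL, named after the URL's MD5 digest.
    fpath = Path(cache_dir) / hashlib.md5(url.encode('utf8')).hexdigest()
    if not fpath.exists():
        write_text(fpath, fetch(url))
    return read_text(fpath)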
Example #12
def dump(out, version, all_langs, identifiers):
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}

    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk or 0), lambda l: l.fpk)}

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'.format(lang.id)]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)
        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)
        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[all_langs[version][lid].link for lid in lang.replacements
                              if lid in all_langs[version]],
                wrap=wrap,
                link_list=link_list,
            )
        )
Example #13
def test_BibFile(tmpdir, bibfiles):
    bf = bibfiles['a.bib']
    assert bf['a:key'].type == 'misc'
    assert bf['s:Andalusi:Turk'].key == 's:Andalusi:Turk'

    for entry in bf.iterentries():
        if entry.key == 'key':
            assert len(list(entry.languoids({'abc': 1})[0])) == 1

    with pytest.raises(KeyError):
        bf['xyz']

    assert len(list(bf.iterentries())) == 3

    lines = [line for line in read_text(bf.fname).split('\n')
             if not line.strip().startswith('glottolog_ref_id')]
    write_text(str(tmpdir / 'a.bib'), '\n'.join(lines))

    entries = bf.load()  # FIXME
    bf.fname = str(tmpdir / 'newa.bib')
    bf.save(entries)

    bf.update(str(tmpdir / 'a.bib'))
    assert len(list(bf.iterentries())) == 3

    bf.update(bibfiles['b.bib'].fname)
    assert len(list(bf.iterentries())) == 1

    def visitor(entry):
        entry.fields['new_field'] = 'a'

    bf.visit(visitor=visitor)
    for entry in bf.iterentries():
        assert 'new_field' in entry.fields

    bf.visit(visitor=lambda e: True)
    assert len(bf.keys()) == 0
Example #14
def app(args):  # pragma: no cover
    """
    Dumps Concepticon's contents for English, German, Chinese, French, Russian, Spanish, and Portuguese.

    Notes
    -----
    Data are by default dumped into a structured JSON file in html/data.js.

    Examples
    --------
    $ concepticon html
    """
    data = defaultdict(list)

    def key(g, l):
        return "{0}---{1}".format(g, l)

    for lang in ["en", "de", "zh", "fr", "ru", "es", "pt"]:
        for cidx, gloss in args.api._get_map_for_language(lang):
            g0, _, g1 = gloss.partition("///")
            csspec = (
                cidx,
                args.api.conceptsets[cidx].gloss,
                args.api.conceptsets[cidx].definition,
                args.api.conceptsets[cidx].ontological_category,
            )
            data[key(g1, lang)].append(csspec)
            if lang == "en":
                data[key(g0, lang)].append(csspec)
                data[key(g0.lower(), lang)].append(csspec)
    data["language"] = "en"
    write_text(
        args.api.appdatadir.joinpath("data.js"),
        "var Concepticon = {0};\n".format(json.dumps(data, indent=2)),
    )
    args.log.info("app data recreated")
Example #15
    def _install(self, **kw):
        self.log = kw.get('log', self.log)
        self.unmapped.clear()
        for p in self.cldf_dir.iterdir():
            if p.name not in ['README.md', '.gitattributes']:
                p.unlink()
        self.tr_analyses = {}
        self.tr_bad_words = []
        self.tr_invalid_words = []

        if len(self.metadata.conceptlist):
            self.conceptlist = self.concepticon.conceptlists[
                self.metadata.conceptlist[0]]
        if self.cmd_install(**kw) == NOOP:
            return

        if self.metadata.known_license:
            legalcode = self.metadata.known_license.legalcode
            if legalcode:
                write_text(self.dir / 'LICENSE', legalcode)

        gitattributes = self.cldf_dir / '.gitattributes'
        if not gitattributes.exists():
            with gitattributes.open('wt') as fp:
                fp.write('*.csv text eol=crlf')

        if kw.get('verbose'):
            self.unmapped.pprint()
        self.cldf.validate(kw['log'])

        stats = transcription.Stats(
            bad_words=sorted(self.tr_bad_words[:100], key=lambda x: x['ID']),
            bad_words_count=len(self.tr_bad_words),
            invalid_words=sorted(self.tr_invalid_words[:100],
                                 key=lambda x: x['ID']),
            invalid_words_count=len(self.tr_invalid_words))
        for lid, analysis in self.tr_analyses.items():
            for attribute in [
                    'segments', 'bipa_errors', 'sclass_errors', 'replacements'
            ]:
                getattr(stats, attribute).update(getattr(analysis, attribute))
            stats.general_errors += analysis.general_errors
            stats.inventory_size += len(analysis.segments) / len(
                self.tr_analyses)

        error_segments = stats.bipa_errors.union(stats.sclass_errors)
        for i, row in enumerate(stats.bad_words):
            analyzed_segments = []
            for s in row['Segments']:
                analyzed_segments.append(
                    '<s> %s </s>' % s if s in error_segments else s)
            stats.bad_words[i] = [
                row['ID'], row['Language_ID'], row['Parameter_ID'],
                row['Form'], ' '.join(analyzed_segments)
            ]

        for i, row in enumerate(stats.invalid_words):
            stats.invalid_words[i] = [
                row['ID'], row['Language_ID'], row['Parameter_ID'], row['Form']
            ]
        # Aggregate transcription analysis results ...
        tr = dict(by_language={
            k: attr.asdict(v)
            for k, v in self.tr_analyses.items()
        },
                  stats=attr.asdict(stats))
        # ... and write a report:
        for text, fname in [
            (transcription.report(tr), 'TRANSCRIPTION.md'),
            (self.report(tr, log=kw.get('log')), 'README.md'),
        ]:
            textdump(text, self.dir / fname, log=kw.get('log'))
Example #16
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'),
                       bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(
        OrderedDict([('dc:title', 'environment'),
                     ('properties',
                      OrderedDict([
                          ('glottolog_version', git_describe(glottolog.repos)),
                      ]))]))
    ds.add_columns('ValueTable', {
        'name': 'Marginal',
        'datatype': 'boolean'
    }, {
        'name': 'Allophones',
        'separator': ' '
    }, 'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID', {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        }, 'URL')
    ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        },
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(
            dict(
                ID=contrib.Name,
                Name=contrib.Contributor,
                Description=contrib.Description,
                Readme=desc(dev, contrib.Name),
                Contents=contrib.Contents,
                Source=[
                    c.strip().lower() for c in contrib.Citation.split(';')
                ],
                URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(
            dict(ID=pid,
                 Name=row.Name,
                 Description=row.Description,
                 SegmentClass=row.SegmentClass,
                 **{f: getattr(row, f)
                    for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(
            ';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(ID=row.ID,
                                   Name=row.Name,
                                   Contributor_ID=row.Contributor_ID,
                                   URL=row.URI if row.URI != 'NA' else '',
                                   Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code
                if row.ISO639P3code != 'NA' else None,
            )
        values.append(
            dict(
                ID=row.ID,
                Language_ID=lid,
                Parameter_ID=pid_map[row.Parameter_ID],
                Contribution_ID=row.Contribution_ID,
                Value=row.Name,
                Marginal=None if row.Marginal == 'NA' else eval(
                    row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
                Allophones=row.Allophones.split()
                if row.Allophones != 'NA' else [],
                Source=src[row.Contribution_ID],
            ))

    ds.write(
        **{
            'ValueTable': values,
            'LanguageTable': languages.values(),
            'ParameterTable': segments,
            'contributions.csv': inventories.values(),
            'contributors.csv': sources
        })
    ds.validate(logging.getLogger(__name__))
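Parameter IDs in this conversion are derived deterministically from each segment's feature description via MD5, so re-running the script assigns the same IDs. A one-line illustration with an invented description string:

from hashlib import md5

pid = md5('voiceless bilabial plosive'.encode('utf8')).hexdigest().upper()  # stable 32-char ID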
Example #17
def test_Dataset_from_data_empty_file(tmpdir):
    write_text(str(tmpdir / 'values.csv'), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))
Example #18
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)

    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = ["tone","stress","syllabic","short","long","consonantal","sonorant","continuant","delayedRelease","approximant","tap","trill","nasal","lateral","labial","round","labiodental","coronal","anterior","distributed","strident","dorsal","high","low","front","back","tense","retractedTongueRoot","advancedTongueRoot","periodicGlottalSource","epilaryngealSource","spreadGlottis","constrictedGlottis","fortis","raisedLarynxEjective","loweredLarynxImplosive","click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 
        'ID', 
        'Name', 
        'Contributor_ID', 
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID, 
            Name=row.Name, 
            Contributor_ID=row.Contributor_ID.upper(), 
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))
    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources
    })
    ds.validate(logging.getLogger(__name__))
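Per-inventory phoneme counts are accumulated in a Counter keyed on (contribution, segment class) pairs and only unpacked into the count_* columns at the end. A reduced sketch of that aggregation with invented toy rows:

from collections import Counter

rows = [('inv1', 'consonant'), ('inv1', 'vowel'), ('inv1', 'consonant')]
counts = Counter(rows)
totals = {}
for (inv, segment_class), n in counts.items():
    totals.setdefault(inv, {'count_phonemes': 0})
    totals[inv]['count_{0}s'.format(segment_class)] = n
    totals[inv]['count_phonemes'] += n
# totals == {'inv1': {'count_phonemes': 3, 'count_consonants': 2, 'count_vowels': 1}}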
Example #19
    def write_js_var(self, var_name, var, *path):
        p = self.path(*path)
        write_text(p, 'var ' + var_name + ' = ' + json.dumps(var, indent=2) + ';')
        self.file_written(p)
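Because the payload is plain JSON, a file produced by write_js_var can be read back in tests by stripping the variable assignment. A hedged round-trip sketch; the helper name is invented:

import json
from clldutils.path import read_text

def load_js_var(path):
    # 'var name = {...};' -> the JSON payload between the first '=' and the trailing ';'
    return json.loads(read_text(path).split('=', 1)[1].rstrip().rstrip(';'))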
Example #20
def htmlmap(args, min_langs_for_legend_item=10):
    """
    glottolog --repos=. htmlmap [OUTDIR] [GLOTTOCODES]
    """
    nodes = {n.id: n for n in args.repos.languoids()}
    legend = Counter()

    glottocodes = None
    if len(args.args) > 1:
        glottocodes = read_text(args.args[1]).split()

    langs = []
    for n in nodes.values():
        if ((glottocodes is None
             and n.level == args.repos.languoid_levels.language) or
            (glottocodes and n.id in glottocodes)) and n.latitude is not None:
            fid = n.lineage[0][1] if n.lineage else n.id
            if (not nodes[fid].category.startswith('Pseudo')) or fid == n.id:
                langs.append((n, fid))
                legend.update([fid])

    color_map = [fid for fid, _ in legend.most_common()]
    color_map = dict(zip(color_map, qualitative_colors(len(color_map))))
    print(color_map)

    def l2f(t):
        n, fid = t
        lon, lat = n.longitude, n.latitude
        if lon <= -26:
            lon += 360  # make the map pacific-centered.

        return {
            "geometry": {
                "coordinates": [lon, lat],
                "type": "Point"
            },
            "id": n.id,
            "properties": {
                "name": n.name,
                "color": color_map[fid],
                "family": nodes[fid].name,
                "family_id": fid,
            },
            "type": "Feature"
        }

    def legend_item(fid, c):
        return \
            '<span style="background-color: {0}; border: 1px solid black;">'\
            '&nbsp;&nbsp;&nbsp;</span> '\
            '<a href="https://glottolog.org/resource/languoid/id/{1}">{2}</a> ({3})'.format(
                color_map[fid], fid, nodes[fid].name, c)

    geojson = {
        "features": list(map(l2f, langs)),
        "properties": {
            "legend": {
                fid: legend_item(fid, c)
                for fid, c in legend.most_common()
                if c >= min_langs_for_legend_item
            },
        },
        "type": "FeatureCollection"
    }

    def rendered_template(name, **kw):
        return Template(
            read_text(
                Path(pyglottolog.__file__).parent.joinpath(
                    'templates', 'htmlmap', name))).substitute(**kw)

    jsname = 'glottolog_map.json'
    outdir = Path('.') if not args.args else Path(args.args[0])
    write_text(
        outdir.joinpath(jsname),
        rendered_template('htmlmap.js', geojson=dumps(geojson, indent=4)))
    html = outdir.joinpath('glottolog_map.html')
    write_text(
        html,
        rendered_template('htmlmap.html',
                          version=git_describe(args.repos.repos),
                          jsname=jsname,
                          nlangs=len(langs)))
    print(html.resolve().as_uri())
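rendered_template fills the bundled htmlmap templates with string.Template, whose $name placeholders are substituted from the keyword arguments. A tiny standalone example of that mechanism; the template text here is invented:

from string import Template

tmpl = Template('var geojson = $geojson;\n// built from $nlangs languages')
print(tmpl.substitute(geojson='{"type": "FeatureCollection"}', nlangs=42))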
Example #21
    def write(self, fname, text, encoding='utf8'):
        write_text(self.joinpath(fname), text, encoding=encoding)
        return fname
Example #22
    def _download(self, **kw):
        self.cmd_download(**kw)
        write_text(
            self.raw / 'README.md',
            'Raw data downloaded {0}'.format(datetime.utcnow().isoformat()))
Example #23
def htmlmap(args):
    """
    glottolog htmlmap [OUTDIR]
    """
    nodes = {n.id: n for n in args.repos.languoids()}
    legend = Counter()

    langs = []
    for n in nodes.values():
        if n.level == Level.language and n.latitude is not None:
            fid = n.lineage[0][1] if n.lineage else n.id
            if not nodes[fid].category.startswith('Pseudo'):
                langs.append((n, fid))
                legend.update([fid])

    color_map = {
        fid: "{0:0{1}X}".format((i + 1) * 10, 3)
        for i, fid in enumerate(sorted(legend.keys()))
    }

    def l2f(t):
        n, fid = t
        lon, lat = n.longitude, n.latitude
        if lon <= -26:
            lon += 360

        return {
            "geometry": {
                "coordinates": [lon, lat],
                "type": "Point"
            },
            "id": n.id,
            "properties": {
                "name": n.name,
                "color": color_map[fid],
                "family": nodes[fid].name,
                "family_id": fid,
            },
            "type": "Feature"
        }

    def legend_item(fid, c):
        return \
            '<span style="background-color: #{0}; border: 1px solid black;">'\
            '&nbsp;&nbsp;&nbsp;</span> '\
            '<a href="http://glottolog.org/resource/languoid/id/{1}">{2}</a> ({3})'.format(
                color_map[fid], fid, nodes[fid].name, c)

    geojson = {
        "features": map(l2f, langs),
        "properties": {
            "legend": {
                fid: legend_item(fid, c)
                for fid, c in legend.most_common() if c > 10
            },
        },
        "type": "FeatureCollection"
    }

    def rendered_template(name, **kw):
        return Template(
            read_text(
                Path(pyglottolog.__file__).parent.joinpath(
                    'templates', 'htmlmap', name))).substitute(**kw)

    jsname = 'glottolog_map.json'
    outdir = Path('.') if not args.args else Path(args.args[0])
    write_text(
        outdir.joinpath(jsname),
        rendered_template('htmlmap.js', geojson=dumps(geojson, indent=4)))
    html = outdir.joinpath('glottolog_map.html')
    write_text(
        html,
        rendered_template('htmlmap.html',
                          version=git_describe(args.repos.repos),
                          jsname=jsname,
                          nlangs=len(langs)))
    print(html.resolve().as_uri())