Example #1
    def test_BibFile(self):
        bibfile = self.api.bibfiles['a.bib']
        self.assertEqual(bibfile['a:key'].type, 'misc')
        self.assertEqual(bibfile['s:Andalusi:Turk'].key, 's:Andalusi:Turk')

        for entry in bibfile.iterentries():
            if entry.key == 'key':
                self.assertEqual(len(list(entry.languoids({'abc': 1})[0])), 1)

        with self.assertRaises(KeyError):
            bibfile['xyz']

        self.assertEqual(len(list(bibfile.iterentries())), 3)

        lines = [line for line in read_text(bibfile.fname).split('\n')
                 if not line.strip().startswith('glottolog_ref_id')]
        write_text(self.tmp_path('a.bib'), '\n'.join(lines))
        bibfile.update(self.tmp_path('a.bib'))
        self.assertEqual(len(list(bibfile.iterentries())), 3)

        bibfile.update(self.api.bibfiles['b.bib'].fname)
        self.assertEqual(len(list(bibfile.iterentries())), 1)

        def visitor(entry):
            entry.fields['new_field'] = 'a'

        bibfile.visit(visitor=visitor)
        for entry in bibfile.iterentries():
            self.assertIn('new_field', entry.fields)

        bibfile.visit(visitor=lambda e: True)
        self.assertEqual(len(bibfile.keys()), 0)
Example #2
def test_read_write(tmppath):
    from clldutils.path import read_text, write_text

    text = 'äöüß'
    p = tmppath / 'test'
    assert write_text(p, text) == len(text)
    assert read_text(p) == text
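
Note: read_text and write_text throughout these examples come from clldutils.path. As a rough sketch for orientation, they presumably behave like thin UTF-8 wrappers around pathlib (this is an assumption, not the actual clldutils source):

from pathlib import Path

def write_text(p, text, encoding='utf8'):
    # Presumably returns the number of characters written, like Path.write_text.
    return Path(p).write_text(text, encoding=encoding)

def read_text(p, encoding='utf8'):
    # Presumably decodes the file with the given encoding and returns str.
    return Path(p).read_text(encoding=encoding)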
Example #3
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    while not id_pattern.match(md['id']):
        print(
            'dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!'
        )
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(
            pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                target = re.sub(r'\+([a-z]+)\+',
                                lambda m: '{' + m.groups()[0] + '}',
                                path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
Example #4
def ds(tmpdir):
    ds = ParallelText.in_dir(str(tmpdir))
    ds.add_component('FunctionalEquivalentTable')
    ds.add_component('FunctionalEquivalentsetTable')
    for fname in [
            'forms.csv',
            'functionalEquivalents.csv',
            'functionalEquivalentsets.csv',
    ]:
        src = Path(__file__).parent / 'data' / 'paralleltext_{0}'.format(fname)
        target = tmpdir.join(fname)
        target.write(read_text(src).encode('utf8'), mode='wb')
    return ds
Example #5
File: util.py Project: clld/apics
def get_text(what, id_, fmt):
    p = text_path(what, '{0}.{1}'.format(id_, fmt))
    if not p.exists():
        raise ValueError(p)
    if fmt == 'json':
        return jsonlib.load(p)
    text = read_text(p)
    if fmt == 'css':
        return text
    body = bs(text).find('body')
    body.name = 'div'
    body.attrs.clear()
    return '{0}'.format(body).replace('.popover(', '.clickover(')
Example #6
def read_url(path, cache_dir=None, log=None):
    """
    Delegate scraping to clldutils, since nowadays this requires tweaking the user agent as well.
    """
    if cache_dir:
        cache_dir = Path(cache_dir)
        if log:  # pragma: no cover
            log.debug('retrieving {0} ...'.format(path))
        fpath = cache_dir / hashlib.md5(path.encode('utf8')).hexdigest()
        if not fpath.exists():
            with iso_639_3._open(path) as fp:
                write_text(fpath, fp.read().decode('utf8'))
        else:  # pragma: no cover
            if log:
                log.debug('... from cache {0}'.format(fpath))
        return read_text(fpath)

    with iso_639_3._open(path) as fp:
        return fp.read().decode('utf8')
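
The caching scheme above keys cache files on the MD5 hash of the URL, writing on a miss and reading back on a hit. A minimal standalone sketch of the same pattern using only the standard library (cached_fetch and the urlopen call are illustrative stand-ins, not pyglottolog API):

import hashlib
from pathlib import Path
from urllib.request import urlopen

def cached_fetch(url, cache_dir):
    # Cache file name = MD5 of the URL, as in read_url above.
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    fpath = cache_dir / hashlib.md5(url.encode('utf8')).hexdigest()
    if not fpath.exists():
        with urlopen(url) as fp:  # stand-in for iso_639_3._open(path)
            fpath.write_text(fp.read().decode('utf8'), encoding='utf8')
    return fpath.read_text(encoding='utf8')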
Example #7
def read_editors(repos):
    res = []
    in_editors, in_table = False, False
    for line in read_text(repos.path('CONTRIBUTORS.md')).split('\n'):
        line = line.strip()
        if line.startswith('##'):
            if in_editors:
                in_editors = False
            elif line.endswith('Editors'):
                in_editors = True
            continue

        if line.startswith('---'):
            in_table = True
            continue

        if in_editors and in_table and line.strip():
            row = [m.strip() for m in line.split('|')]
            row[2] = [n.strip() for n in row[2].split('&')]
            res.append(row)
    return sorted(res,
                  key=lambda t: tuple(map(int, t[0].split('.'))),
                  reverse=True)
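
For reference, a hypothetical CONTRIBUTORS.md fragment in the shape this parser expects (a '##' heading ending in 'Editors', a separator line starting with '---', pipe-delimited rows, '&'-separated names in the third column; versions and names are illustrative):

## Editors

Version | Period | Editors
--- | --- | ---
4.1 | 2019- | Jane Doe & John Doe
4.0 | 2018-2019 | Jane Doe

Given such a file, read_editors returns the data rows newest-first, with the third column split into a list, e.g. ['4.1', '2019-', ['Jane Doe', 'John Doe']].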
Example #8
def test_BibFile(tmpdir, bibfiles):
    bf = bibfiles['a.bib']
    assert bf['a:key'].type == 'misc'
    assert bf['s:Andalusi:Turk'].key == 's:Andalusi:Turk'

    for entry in bf.iterentries():
        if entry.key == 'key':
            assert len(list(entry.languoids({'abc': 1})[0])) == 1

    with pytest.raises(KeyError):
        bf['xyz']

    assert len(list(bf.iterentries())) == 3

    lines = [line for line in read_text(bf.fname).split('\n')
             if not line.strip().startswith('glottolog_ref_id')]
    write_text(str(tmpdir / 'a.bib'), '\n'.join(lines))

    entries = bf.load()  # FIXME
    bf.fname = str(tmpdir / 'newa.bib')
    bf.save(entries)

    bf.update(str(tmpdir / 'a.bib'))
    assert len(list(bf.iterentries())) == 3

    bf.update(bibfiles['b.bib'].fname)
    assert len(list(bf.iterentries())) == 1

    def visitor(entry):
        entry.fields['new_field'] = 'a'

    bf.visit(visitor=visitor)
    for entry in bf.iterentries():
        assert 'new_field' in entry.fields

    bf.visit(visitor=lambda e: True)
    assert len(bf.keys()) == 0
Example #9
 def read(self, fname):
     self._add_entries(database.parse_string(read_text(fname), bib_format='bibtex'))
Example #10
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'),
                       bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(
        OrderedDict([('dc:title', 'environment'),
                     ('properties',
                      OrderedDict([
                          ('glottolog_version', git_describe(glottolog.repos)),
                      ]))]))
    ds.add_columns('ValueTable', {
        'name': 'Marginal',
        'datatype': 'boolean'
    }, {
        'name': 'Allophones',
        'separator': ' '
    }, 'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID', {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        }, 'URL')
    ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        },
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(
            dict(
                ID=contrib.Name,
                Name=contrib.Contributor,
                Description=contrib.Description,
                Readme=desc(dev, contrib.Name),
                Contents=contrib.Contents,
                Source=[
                    c.strip().lower() for c in contrib.Citation.split(';')
                ],
                URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(
            dict(ID=pid,
                 Name=row.Name,
                 Description=row.Description,
                 SegmentClass=row.SegmentClass,
                 **{f: getattr(row, f)
                    for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(
            ';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(ID=row.ID,
                                   Name=row.Name,
                                   Contributor_ID=row.Contributor_ID,
                                   URL=row.URI if row.URI != 'NA' else '',
                                   Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code
                if row.ISO639P3code != 'NA' else None,
            )
        values.append(
            dict(
                ID=row.ID,
                Language_ID=lid,
                Parameter_ID=pid_map[row.Parameter_ID],
                Contribution_ID=row.Contribution_ID,
                Value=row.Name,
                Marginal=None if row.Marginal == 'NA' else eval(
                    row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
                Allophones=row.Allophones.split()
                if row.Allophones != 'NA' else [],
                Source=src[row.Contribution_ID],
            ))

    ds.write(
        **{
            'ValueTable': values,
            'LanguageTable': languages.values(),
            'ParameterTable': segments,
            'contributions.csv': inventories.values(),
            'contributors.csv': sources
        })
    ds.validate(logging.getLogger(__name__))
Example #11
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
        segments="tokens", form="ipa", note='note', form_in_source="value",
        source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.
    
    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTex file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default='note')
        The column in which you store your comments.
    form_in_source : str (default='value')
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(
            read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                    ID=lid,
                    Name=wordlist[idx, 'doculect'],
                    Glottocode = wordlist[idx, 'glottocode'])

        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])

        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))

        if ref:
            cognates.append(dict(ID=str(idx), Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref], Alignment=wordlist[idx,
                    alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)
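
A minimal hedged invocation, assuming a LingPy wordlist file whose columns include doculect, concept, ipa, tokens, and cogid, plus the glottocode and concepticon_id columns the function body reads (file names here are illustrative):

from lingpy import Wordlist

wl = Wordlist('wordlist.tsv')
to_cldf(wl, path='cldf', source_path='sources.bib')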
Example #12
 def legalcode(self):
     p = Path(__file__).parent / 'legalcode' / self.id
     if p.exists():
         return read_text(p)
Example #13
def htmlmap(args, min_langs_for_legend_item=10):
    """
    glottolog --repos=. htmlmap [OUTDIR] [GLOTTOCODES]
    """
    nodes = {n.id: n for n in args.repos.languoids()}
    legend = Counter()

    glottocodes = None
    if len(args.args) > 1:
        glottocodes = read_text(args.args[1]).split()

    langs = []
    for n in nodes.values():
        if ((glottocodes is None
             and n.level == args.repos.languoid_levels.language) or
            (glottocodes and n.id in glottocodes)) and n.latitude is not None:
            fid = n.lineage[0][1] if n.lineage else n.id
            if (not nodes[fid].category.startswith('Pseudo')) or fid == n.id:
                langs.append((n, fid))
                legend.update([fid])

    color_map = [fid for fid, _ in legend.most_common()]
    color_map = dict(zip(color_map, qualitative_colors(len(color_map))))
    print(color_map)

    def l2f(t):
        n, fid = t
        lon, lat = n.longitude, n.latitude
        if lon <= -26:
            lon += 360  # make the map pacific-centered.

        return {
            "geometry": {
                "coordinates": [lon, lat],
                "type": "Point"
            },
            "id": n.id,
            "properties": {
                "name": n.name,
                "color": color_map[fid],
                "family": nodes[fid].name,
                "family_id": fid,
            },
            "type": "Feature"
        }

    def legend_item(fid, c):
        return \
            '<span style="background-color: {0}; border: 1px solid black;">'\
            '&nbsp;&nbsp;&nbsp;</span> '\
            '<a href="https://glottolog.org/resource/languoid/id/{1}">{2}</a> ({3})'.format(
                color_map[fid], fid, nodes[fid].name, c)

    geojson = {
        "features": list(map(l2f, langs)),
        "properties": {
            "legend": {
                fid: legend_item(fid, c)
                for fid, c in legend.most_common()
                if c >= min_langs_for_legend_item
            },
        },
        "type": "FeatureCollection"
    }

    def rendered_template(name, **kw):
        return Template(
            read_text(
                Path(pyglottolog.__file__).parent.joinpath(
                    'templates', 'htmlmap', name))).substitute(**kw)

    jsname = 'glottolog_map.json'
    outdir = Path('.') if not args.args else Path(args.args[0])
    write_text(
        outdir.joinpath(jsname),
        rendered_template('htmlmap.js', geojson=dumps(geojson, indent=4)))
    html = outdir.joinpath('glottolog_map.html')
    write_text(
        html,
        rendered_template('htmlmap.html',
                          version=git_describe(args.repos.repos),
                          jsname=jsname,
                          nlangs=len(langs)))
    print(html.resolve().as_uri())
Example #14
 def rendered_template(name, **kw):
     return Template(
         read_text(
             Path(pyglottolog.__file__).parent.joinpath(
                 'templates', 'htmlmap', name))).substitute(**kw)
Example #15
 def read(self, fname, encoding='utf8'):
     return read_text(self.joinpath(fname), encoding=encoding)
Example #16
def app_name(project_dir):
    setup = read_text(project_dir / 'setup.py')
    match = re.search(r'main\s*=\s*(?P<name>[a-z0-9]+):main', setup)
    if match:
        return match.group('name')
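
The regex above looks for a console script literally named main in setup.py. A hypothetical setup.py fragment it would match (illustrative, not from a real project):

setup(
    name='myapp',
    entry_points={
        'console_scripts': ['main=myapp:main'],
    },
)

For this fragment, app_name(project_dir) returns 'myapp'.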
Example #17
 def add_sources(self, ds):
     ds.add_sources(read_text(self.raw / 'sources.bib'))
Example #18
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)

    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = ["tone","stress","syllabic","short","long","consonantal","sonorant","continuant","delayedRelease","approximant","tap","trill","nasal","lateral","labial","round","labiodental","coronal","anterior","distributed","strident","dorsal","high","low","front","back","tense","retractedTongueRoot","advancedTongueRoot","periodicGlottalSource","epilaryngealSource","spreadGlottis","constrictedGlottis","fortis","raisedLarynxEjective","loweredLarynxImplosive","click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 
        'ID', 
        'Name', 
        'Contributor_ID', 
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID, 
            Name=row.Name, 
            Contributor_ID=row.Contributor_ID.upper(), 
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))
    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources
    })
    ds.validate(logging.getLogger(__name__))