Example 1
def test_create(api, wiki, capsys, tmp_path):
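    # Create a CLDF StructureDataset in a temporary directory from the test
    # API and the bundled Glottolog data, then read it back and check it.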
    cldf_repos = tmp_path
    cldf.create(StructureDataset.in_dir(cldf_repos / 'cldf'), api,
                Path(__file__).parent / 'glottolog')
    #captured = capsys.readouterr()
    #assert 'inconsistent' in captured.out
    ds = StructureDataset.from_metadata(cldf_repos / 'cldf' /
                                        'StructureDataset-metadata.json')
    assert len(list(ds['ValueTable'])) == 1
    assert ds['contributors.csv', 'Photo'].valueUrl.expand(list(ds['contributors.csv'])[0]) == \
           'https://glottobank.org/photos/abc'
Example 2
def main(args):  # pragma: no cover
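    # Load the CLDF StructureDataset and register its BibTeX sources.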
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))

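    # Parse extra BibTeX records from BIB and add those not already known.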
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

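    # Add contributors and link each one to its sources.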
    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License',
        })

    for i, (cid, name) in enumerate(
            [('UZ', "Steven Moran"), ('mccloy', "Daniel McCloy")], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

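    # Set up the Glottolog API from a local 'glottolog/glottolog' clone,
    # located relative to the phoible package.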
    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

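    # Add varieties; those with 8-character IDs (Glottocodes) are attached to
    # Glottolog families below.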
    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(data, [(l.id, l)
                         for l in data['Variety'].values() if len(l.id) == 8],
                  glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)

    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(),
                                     key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

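    # Add segments; the equivalence class strips combining and modifier characters.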
    for segment in ds['ParameterTable']:
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ]),
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent /
            'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid),
                    languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

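    # Store the remaining per-segment feature columns as ordered Parameter_data.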
    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

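    # Add inventories (contributions) with their contributor and sources.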
    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

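    # Add phonemes: one ValueSet per row, tying inventory, language and segment together.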
    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' % (phoneme['Value'], inv.name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))

    return
Example 3
    def cmd_makecldf(self, args):
        #
        # Augment the schema of the rather simplistic CLDF download:
        #
        ds = args.writer.cldf
        # Add tables for controlled vocabularies:
        ds.add_table('regions.csv', 'ID', 'Name')
        ds.add_table('varietytypes.csv', 'ID', 'Name', 'Description')
        ds.add_table('featurecategories.csv', 'ID', 'Name', 'Description')
        ds.add_table('contributors.csv', 'ID', 'Name', 'URL', 'Address',
                     'Email')

        # We merge the data from contributions.csv into languages.csv for simplicity:
        ds.remove_table('contributions.csv')

        # Varieties have a region, a type, an abbreviation and contributors.
        ds.add_columns('LanguageTable', 'Description', 'Region_ID', 'Type_ID',
                       'abbr', {
                           'name': 'Contributor_ID',
                           'separator': ' '
                       })
        ds['LanguageTable'].add_foreign_key('Region_ID', 'regions.csv', 'ID')
        ds['LanguageTable'].add_foreign_key('Type_ID', 'varietytypes.csv',
                                            'ID')
        ds['LanguageTable'].add_foreign_key('Contributor_ID',
                                            'contributors.csv', 'ID')

        # Features have a category and a typical example, with source.
        ds.add_columns(
            'ParameterTable',
            'Category_ID',
            'Example_Source',
            {
                'name': 'Attestation',
                'datatype': 'float',
                'dc:description':
                    "Attestation is a relative measure of how widespread a feature is in the set "
                    "of eWAVE varieties. It is expressed as a percentage and is calculated as the "
                    "sum of all A-, B- and C-ratings for a feature, divided by the number of "
                    "varieties in the eWAVE dataset. The closer the value to 100%, the more "
                    "widespread the feature is.",
            },
            {
                'name': 'Pervasiveness',
                'datatype': 'float',
                'dc:description': """\
Pervasiveness provides a measure of how pervasive a feature is on average in the varieties in 
which it is attested. Pervasiveness is calculated as all A-ratings for a feature plus 0.6 times 
the B-ratings for the same feature plus 0.3 times the C-ratings, divided by the sum of all 
A-, B- and C-ratings for the feature. This value is then multiplied by 100 and expressed as a 
percentage. A Pervasiveness value of 100% or close to 100% thus indicates that the feature is 
highly pervasive (rated A) in all or most of the varieties for which it is attested, while a 
value close to 30% (the lowest possible value) indicates that the feature is extremely rare 
(rated C) in most or all of the varieties for which it is attested. Intermediate values are less 
easy to interpret – here one has to look more closely at the ratio of A- to B- to C-values. 
Two more things should also be noted here:

- The Pervasiveness value does not provide information on how widespread a feature is in the entire 
  eWAVE dataset, i.e. for how many varieties the feature is actually attested.
- Since the eWAVE contributors did not all use exactly the same strategies in deciding when to 
  give a feature an A- vs. a B- or a C- vs. a B- rating, it is very difficult to translate the 
  ratings into numerical values that adequately reflect the differences between A-, B- and 
  C-ratings. The choice made here (1 for A, 0.6 for B and 0.3 for C) is certainly only one of 
  many, and further testing is required to see how adequate this model is.
""",
            },
        )
        ds['ParameterTable'].add_foreign_key('Category_ID',
                                             'featurecategories.csv', 'ID')

        # Values may have (many) examples:
        ds.add_columns(
            'ValueTable', {
                'name': 'Example_ID',
                'propertyUrl':
                'http://cldf.clld.org/v1.0/terms.rdf#exampleReference',
                'separator': ' ',
            })
        # ... but no Contribution_ID anymore:
        ds.remove_columns('ValueTable', 'Contribution_ID')

        # Examples may have sources:
        ds.add_columns(
            'ExampleTable', {
                'name': 'Source',
                'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
                'separator': ';',
            })

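        # history.csv records which code a (language, parameter) pair had in earlier versions.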
        history = ds.add_table('history.csv', 'Version', 'Language_ID',
                               'Parameter_ID', 'Code_ID')
        history.add_foreign_key('Language_ID', 'languages.csv', 'ID')
        history.add_foreign_key('Parameter_ID', 'parameters.csv', 'ID')
        history.add_foreign_key('Code_ID', 'codes.csv', 'ID')

        #
        # Now add the data:
        #
        ds.add_sources(self.raw_dir.read('sources.bib'))

        args.writer.objects['varietytypes.csv'] = [{
            'ID': r[0],
            'Name': r[1],
            'Description': r[2]
        } for r in self.raw_dir.read_csv('varietytype.psv', delimiter='|')]
        args.writer.objects['featurecategories.csv'] = [{
            'ID': r[0],
            'Name': r[1],
            'Description': r[2]
        } for r in self.raw_dir.read_csv('featurecategory.psv', delimiter='|')]
        args.writer.objects['regions.csv'] = [{
            'ID': r[0],
            'Name': r[1]
        } for r in self.raw_dir.read_csv('region.psv', delimiter='|')]

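        # Seed history.csv with the version 1.0 codes read from changes.json.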
        for lid, pid, cid, _ in self.raw_dir.read_json('changes.json')['2013']:
            args.writer.objects['history.csv'].append({
                'Version': '1.0',
                'Language_ID': lid,
                'Parameter_ID': pid,
                'Code_ID': '{0}-{1}'.format(pid, cid.replace('?', 'NA')),
            })

        for row in self.raw_dir.read_csv('contributors.csv', dicts=True):
            # id, name, url, email, address
            args.writer.objects['contributors.csv'].append({
                'ID': row['id'],
                'Name': row['name'],
                'URL': row['url'],
                'Email': row['email'],
                'Address': row['address'],
            })

        # We read the bulk of the data from the CLDF export of the website:
        raw_ds = StructureDataset.from_metadata(
            self.raw_dir / 'StructureDataset-metadata.json')

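        # Map each variety ID to its ordered list of contributor IDs (from cc.csv).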
        cc = {
            cid: [r[1] for r in rows]
            for cid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('cc.csv'),
                       key=lambda r: (int(r[0]), int(r[2]), int(r[1]))),
                lambda r: r[0],
            )
        }
        desc = {
            r['ID']: r['Description']
            for r in self.raw_dir.read_csv('contributions.csv', dicts=True)
        }
        data = {r[0]: r[1:] for r in self.raw_dir.read_csv('variety.csv')}
        for row in raw_ds['LanguageTable']:
            row['Region_ID'] = data[row['ID']][0]
            row['Type_ID'] = data[row['ID']][1]
            row['abbr'] = data[row['ID']][2]
            row['Description'] = desc[row['ID']]
            row['Contributor_ID'] = cc[row['ID']]
            args.writer.objects['LanguageTable'].append(row)

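        # Merge per-feature data (example source, category, attestation,
        # pervasiveness) from feature.csv into the ParameterTable rows.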
        data = {r[0]: r[1:] for r in self.raw_dir.read_csv('feature.csv')}
        for row in raw_ds['ParameterTable']:
            row['Example_Source'] = data[row['ID']][0]
            row['Category_ID'] = data[row['ID']][1]
            row['Attestation'] = data[row['ID']][2]
            row['Pervasiveness'] = data[row['ID']][3]
            args.writer.objects['ParameterTable'].append(row)

        # Augment examples.csv
        def ref(r):
            return str(Reference(
                r['source'],
                r['description'].replace('[', '(').replace(']', ')')))

        examplesource = {
            eid: [ref(r) for r in rows]
            for eid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('examplesource.csv', dicts=True),
                       key=lambda d: (int(d['example']), d['source'])),
                lambda d: d['example'])
        }
        for row in raw_ds['ExampleTable']:
            row['Source'] = examplesource.get(row['ID'], [])
            args.writer.objects['ExampleTable'].append(row)

        # Renumber codes and values!
        for row in raw_ds['CodeTable']:
            row['ID'] = '{0}-{1}'.format(row['Parameter_ID'],
                                         row['Name'].replace('?', 'NA'))
            args.writer.objects['CodeTable'].append(row)

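        # Map each value ID to the IDs of its example sentences (from valueexample.csv).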
        valuesentence = {
            vid: [r['sentence'] for r in rows]
            for vid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('valueexample.csv', dicts=True),
                       key=lambda d: (int(d['value']), int(d['sentence']))),
                lambda d: d['value'])
        }

        for row in raw_ds['ValueTable']:
            row['Example_ID'] = valuesentence.get(row['ID'], [])
            row['ID'] = '{0}-{1}'.format(row['Language_ID'],
                                         row['Parameter_ID'])
            row['Code_ID'] = '{0}-{1}'.format(
                row['Parameter_ID'], row['Value'] or 'NA')
            args.writer.objects['ValueTable'].append(row)
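
The Attestation and Pervasiveness columns added to ParameterTable above are read pre-computed from feature.csv, but their column descriptions fully specify the formulas. A minimal sketch of those formulas (function names and rating counts are illustrative, not part of the dataset code):

def attestation(n_a, n_b, n_c, n_varieties):
    # Share of all eWAVE varieties with an A-, B- or C-rating, as a percentage.
    return 100 * (n_a + n_b + n_c) / n_varieties

def pervasiveness(n_a, n_b, n_c):
    # A-ratings count fully, B-ratings 0.6, C-ratings 0.3; divided by all
    # attesting varieties and expressed as a percentage (lowest value: 30%).
    return 100 * (n_a + 0.6 * n_b + 0.3 * n_c) / (n_a + n_b + n_c)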