Esempio n. 1
0
def add_meta_data(session):
    """
    Creates and adds to the given SQLAlchemy session the common.Dataset and
    related model instances that comprise the project's meta info.

    Helper for the main function that keeps the meta data in one place for
    easier reference and editing.
    """
    dataset = common.Dataset(
        id='northeuralex',
        name='NorthEuraLex',
        description='Lexicostatistical Database of Northern Eurasia',
        publisher_name=
        'Seminar für Sprachwissenschaft at the University of Tübingen',
        publisher_place='Tübingen',
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        jsondata={
            'license_icon':
            'cc-by-sa.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
        contact='*****@*****.**',
        domain='northeuralex.org')
    session.add(dataset)

    dataset.editors.append(
        common.Editor(contributor=common.Contributor(id='jdellert',
                                                     name='Johannes Dellert')))
    dataset.editors.append(
        common.Editor(
            contributor=common.Contributor(id='gjäger', name='Gerhard Jäger')))
def add_meta_data(session):
    """
    Creates and adds to the given SQLAlchemy session the common.Dataset and
    related model instances that comprise the project's meta info.
    Helper for the main function that keeps the meta data in one place for
    easier reference and editing.
    """
    dataset = common.Dataset(
        id='tuled',
        name='TuLeD',
        description='Tupían Lexical Database',
        publisher_name=
        'Seminar für Sprachwissenschaft at the University of Tübingen',
        publisher_place='Tübingen',
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        jsondata={
            'license_icon':
            'cc-by-sa.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
        contact='*****@*****.**',
        domain='xyz.org')
    session.add(dataset)

    dataset.editors.append(
        common.Editor(contributor=common.Contributor(
            id='fgerardi', name='Fabrício Ferraz Gerardi')))
    dataset.editors.append(
        common.Editor(contributor=common.Contributor(
            id='sreichert', name='Stanislav Reichert')))
Esempio n. 3
0
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description=
        "Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon':
            'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )

    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(common.Contributor,
                      "Kevin Scannell",
                      id="Kevin Scannell",
                      name="Kevin Scannell",
                      email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
Esempio n. 4
0
def _addEditor(dataset, count, lp):
    """For a lighter 'main' function."""
    eds = ['Frank Seifart', 'Ludger Paschen', 'Matthew Stave']
    ed = dorEditor(id=lp[0],
                   name=lp[0],
                   url=lp[1],
                   email=lp[2],
                   address=lp[3],
                   team=lp[4],
                   function=lp[5])
    if lp[0] in eds:
        common.Editor(dataset=dataset, contributor=ed, ord=count + 1)
        count += 1
    DBSession.add(ed)
    DBSession.flush()
    return dataset, count
Esempio n. 5
0
def main(args):
    data = Data()

    dataset = common.Dataset(id=moslex.__name__,
                             domain='moslex.clld.org',
                             name='MosLex',
                             license='https://creativecommons.org'
                             '/licenses/by-nc/4.0/',
                             jsondata={
                                 'license_icon':
                                 'cc-by-nc.png',
                                 'license_name':
                                 'Creative Commons '
                                 'Attribution-NC 4.0 '
                                 'International License'
                             })

    editor = data.add(common.Contributor,
                      'editor',
                      id='editor',
                      name='Alexei Kassian')
    common.Editor(dataset=dataset, contributor=editor)

    with open('languoids.json') as file:
        languoids = json.load(file)

    with open('concepts.json') as file:
        concepts = json.load(file)

    with open('forms.json') as file:
        forms = json.load(file)

    add_languoids(data, languoids)
    add_concepts(data, concepts)
    add_forms(data, forms)

    DBSession.add(dataset)
Esempio n. 6
0
def main(args):
    data = Data()

    dataset = common.Dataset(id=moslex.__name__,
                             domain='moslex.clld.org',
                             name='MosLex',
                             license='https://creativecommons.org'
                             '/licenses/by-nc/4.0/',
                             jsondata={
                                 'license_icon':
                                 'cc-by-nc.png',
                                 'license_name':
                                 'Creative Commons '
                                 'Attribution-NC 4.0 '
                                 'International License'
                             })

    editor = data.add(common.Contributor,
                      'editor',
                      id='editor',
                      name='Alexei Kassian',
                      email='*****@*****.**')
    common.Editor(dataset=dataset, contributor=editor)

    with open(os.environ['MOSLEX_CONCEPTS']) as file:
        concepts = json.load(file)
    add_concepts(data, concepts)

    data_folders = [
        path
        for path in glob.glob(os.path.join(os.environ['MOSLEX_DATA'], '*'))
        if os.path.isdir(path)
    ]
    for folder in data_folders:
        add_data_folder(folder, data)

    DBSession.add(dataset)
Esempio n. 7
0
def main(args):  # pragma: no cover
    data = Data()
    clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data'
    clts_repos = CLTS(clts_repos)
    print(clts_repos.repos)
    version = 'v2.1.0' # assert_release(clts_repos.repos)

    for rec in Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id='clts',
        name="CLTS {0}".format(version),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate([
        'Johann-Mattis List',
        'Cormac Anderson',
        'Tiago Tresoldi',
        'Robert Forkel',
    ]):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for line in args.cldf['data/features.tsv']:
        data.add(
            models.Feature,
            line['ID'],
            id=line['ID'],
            name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']),
            sound_type=line['TYPE'],
            feature=line['FEATURE'],
            value=line['VALUE'],
        )

    DBSession.add(models.SoundSegment(
        id='NA',
        name='<NA>',
        description='<NA>',
        type='marker',
        generated=True,
        unicode='',
        color='#bbbbbb',
    ))
    for line in args.cldf['data/sounds.tsv']:
        s = data.add(
            models.SoundSegment,
            line['ID'],
            id=line['ID'],
            name=line['GRAPHEME'],
            description=line['NAME'],
            type=line['TYPE'],
            generated=line['GENERATED'],
            unicode=' / '.join(line['UNICODE']),
            color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']),
        )
        if s.color == '0':
            s.color = '#bbbbbb'
        assert s.color in LEGEND
    DBSession.flush()

    seen = set()
    for line in args.cldf['data/sounds.tsv']:
        for fid in line['FEATURES']:
            spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk
            if (spk, fpk) not in seen:
                DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk))
                seen.add((spk, fpk))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    for line in args.cldf['sources/index.tsv']:
        c = data.add(
            models.Transcription,
            line['NAME'],
            id=line['NAME'],
            name=line['NAME'],
            description=line['DESCRIPTION'].replace(':bib:', '/sources/'),
            datatype=getattr(models.Datatype, line['TYPE'])
        )
        for ref in line.get('REFS', []):
            common.ContributionReference(source=data['Source'][ref], contribution=c)

    sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl
    image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl

    for line in args.cldf['data/graphemes.tsv']:
        key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME']
        if key not in data['Grapheme']:
            sound_id = line['NAME'].replace(' ', '_')
            vs = data['ValueSet'].get((line['DATASET'], line['NAME']))
            if not vs:
                try:
                    vs = data.add(
                        common.ValueSet,
                        (line['DATASET'], line['NAME']),
                        id=key,
                        description=line['NAME'],
                        language=english,
                        contribution=data['Transcription'][line['DATASET']],
                        parameter=data['SoundSegment'][sound_id]
                    )
                except:
                    print(line)
                    raise
            data.add(
                models.Grapheme,
                key,
                id=key,
                name=line['GRAPHEME'],
                description=line['NAME'],
                url=line['URL'].unsplit() if line['URL'] else None,
                audio=sound_url_template.expand(line) if line['SOUND'] else None,
                image=image_url_template.expand(line) if line['IMAGE'] else None,
                valueset=vs
            )
Esempio n. 8
0
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")
    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={'license_icon': 'cc-by.png', 'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.editors.values():
        if e.current:
            ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
            common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        args.repos.aes_status.values(),
        name=args.repos.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=args.repos.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in args.repos.aes_sources.values()],
                scale=[attr.asdict(v) for v in args.repos.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        args.repos.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        args.repos.macroareas.values(),
        pkw=dict(
            description=args.repos.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)),
        ),
    )
    add('ltype',
        args.repos.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        args.repos.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level
    # nodes will precede nested nodes. This order must be preserved using an `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
    lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()}
    for lang in nodemap.values():
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
Esempio n. 9
0
def main(args):
    _ = args
    data = Data()
    cldf_data = args.cldf

    data.add(common.Contributor,
             'fehnannemarie',
             id='fehnannemarie',
             name="Anne-Marie Fehn",
             url="https://shh.mpg.de")

    # TODO: Editors/Contributors
    dataset = common.Dataset(id=kba.__name__,
                             name="KBA",
                             publisher_name="Max Planck Institute for the "
                             "Science of Human History",
                             publisher_place="Jena",
                             publisher_url="http://www.shh.mpg.de",
                             license="http://creativecommons.org/licenses/by"
                             "/4.0/",
                             domain='kba.clld.org',
                             jsondata={
                                 'license_icon':
                                 'cc-by.png',
                                 'license_name':
                                 'Creative Commons '
                                 'Attribution 4.0 '
                                 'International '
                                 'License'
                             })

    DBSession.add(dataset)

    for i, editor in enumerate(['fehnannemarie']):
        common.Editor(dataset=dataset,
                      contributor=data['Contributor'][editor],
                      ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for language in cldf_data['LanguageTable']:
        lang = data.add(models.KbaLanguage,
                        language['ID'],
                        id=language['ID'],
                        name=language['Name'])
        add_language_codes(data, lang, None, glottocode=language['Glottocode'])

    # TODO: Concepticon
    for parameter in cldf_data['ParameterTable']:
        data.add(common.Parameter,
                 parameter['ID'],
                 id=parameter['ID'],
                 name='{0} ({1})'.format(parameter['Name'], parameter['ID']))

    for form in cldf_data['FormTable']:
        valueset_id = '{0}-{1}'.format(form['Parameter_ID'],
                                       form['Language_ID'])
        valueset = data['ValueSet'].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            valueset = data.add(
                common.ValueSet,
                valueset_id,
                id=valueset_id,
                language=data['KbaLanguage'][form['Language_ID']],
                parameter=data['Parameter'][form['Parameter_ID']],
                contribution=contrib)

        DBSession.add(
            models.Word(id=form['ID'],
                        name=form['Form'],
                        comment=form.get('Comment'),
                        sourceorthography=form.get('sourceorthography'),
                        kbaorthography=form.get('kbaorthography'),
                        wordclass=form.get('wordclass'),
                        grammaticalnotes=form.get('grammaticalnotes'),
                        idiolectalvariant=form.get('idiolectalvariant'),
                        originaltranslation=form.get('originaltranslation'),
                        valueset=valueset))

    load_families(data,
                  [(l.glottocode, l) for l in data['KbaLanguage'].values()],
                  glottolog_repos=args.glottolog,
                  isolates_icon='tcccccc')
Esempio n. 10
0
def main(args):
    data = Data()
    ds = Pofatu(
        pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data')

    dataset = common.Dataset(
        id=pofatu.__name__,
        name="POFATU",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='pofatu.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })

    for i, (id_, name) in enumerate([
        ('hermannaymeric', 'Aymeric Hermann'),
        ('forkelrobert', 'Robert Forkel'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for rec in ds.iterbib():
        rec.genre = bibtex.EntryType.from_string(
            ENTRY_TYPES.get(rec.genre, rec.genre))
        if 'date' in rec:
            rec['year'] = rec.pop('date')
        data.add(common.Source,
                 rec.id,
                 _obj=bibtex2source(rec, lowercase_id=False))

    analyses = list(ds.iterdata())

    def midpoint(coords):
        p = MultiPoint([(lat, lon + 360 if lon < 0 else lon)
                        for lat, lon in coords]).convex_hull
        #geojson = {
        #    'type': 'Feature',
        #    'properties': {},
        #    'geometry': mapping(p)}
        c = p.centroid
        return c.x, (c.y - 360) if c.y > 180 else c.y

    artefacts = collections.defaultdict(dict)
    midpoints = {}
    for a in analyses:
        l = a.sample.location
        lid = l.id
        if lid not in midpoints:
            midpoints[lid] = set()
        if l.latitude is not None and l.longitude is not None:
            midpoints[lid].add((l.latitude, l.longitude))
        art = a.sample.artefact
        for attr_ in ['name', 'category', 'collection_type']:
            if not artefacts[slug(art.id)].get(attr_):
                artefacts[slug(art.id)][attr_] = getattr(art, attr_)

    midpoints = {
        k: midpoint(v) if v else (None, None)
        for k, v in midpoints.items()
    }

    for analysis in analyses:
        loc = analysis.sample.location
        if loc.id not in data['Location']:
            data.add(
                models.Location,
                loc.id,
                id=valid_id(loc.id),
                name=loc.label,
                latitude=midpoints[loc.id][0],
                longitude=midpoints[loc.id][1],
                region=loc.region.replace('_', ' '),
                subregion=loc.subregion,
                location=loc.locality,
            )

    # Add contributions
    for contrib in ds.itercontributions():
        contribution = data.add(
            common.Contribution,
            contrib.id,
            id=valid_id(contrib.id),
            name=contrib.label,
            description=contrib.description,
        )
        DBSession.flush()
        for i, name in enumerate(contrib.contributors):
            cid = slug(name)
            co = data['Contributor'].get(cid)
            if not co:
                co = data.add(common.Contributor, cid, id=cid, name=name)
            common.ContributionContributor(ord=i,
                                           contribution=contribution,
                                           contributor=co)

        for ref in contrib.source_ids:
            DBSession.add(
                common.ContributionReference(
                    contribution=contribution,
                    source=data['Source'][ref],
                ))
            data['Contribution'][ref] = contribution

    methods = collections.defaultdict(list)
    for method in ds.itermethods():
        m = data.add(
            models.Method,
            method.id,
            id=valid_id(method.id),
            name=method.label,
            code=method.code,
            parameter=method.parameter.strip(),
            instrument=method.instrument,
            number_of_replicates=method.number_of_replicates,
            date=method.date,
            comment=method.comment,
            detection_limit=method.detection_limit,
            detection_limit_unit=method.detection_limit_unit,
            total_procedural_blank_value=method.total_procedural_blank_value,
            total_procedural_unit=method.total_procedural_unit,
        )
        methods[(m.code.lower(), m.parameter.lower())].append(m)
        for ref in method.references:
            DBSession.add(
                models.MethodReference(
                    method=m,
                    sample_name=ref.sample_name,
                    sample_measured_value=ref.sample_measured_value,
                    uncertainty=ref.uncertainty,
                    uncertainty_unit=ref.uncertainty_unit,
                    number_of_measurements=ref.number_of_measurements,
                ))
        for ref in method.normalizations:
            DBSession.add(
                models.Normalization(
                    method=m,
                    reference_sample_name=ref.reference_sample_name,
                    reference_sample_accepted_value=ref.
                    reference_sample_accepted_value,
                    citation=ref.citation,
                ))

    parameter = data.add(common.Parameter,
                         'c',
                         id='category',
                         name='Sample category')
    for i, opt in enumerate(attr.fields_dict(
            pypofatu.models.Sample)['sample_category'].validator.options,
                            start=1):
        data.add(common.DomainElement,
                 opt,
                 parameter=parameter,
                 id=str(i),
                 name=opt)

    DBSession.flush()
    assert parameter.pk

    # Add Samples and UnitParameters and Measurements
    for analysis in analyses:
        sample = analysis.sample
        vsid = '{0}-{1}'.format(sample.location.id,
                                data['Contribution'][sample.source_id].id)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id=valid_id(vsid),
                language_pk=data['Location'][sample.location.id].pk,
                parameter_pk=parameter.pk,
                contribution_pk=data['Contribution'][sample.source_id].pk,
            )
        v = data['Sample'].get(sample.id)
        if not v:
            v = data.add(
                models.Sample,
                sample.id,
                id=valid_id(sample.id),
                name=sample.id,
                sample_name=sample.sample_name,
                sample_comment=sample.sample_comment,
                petrography=sample.petrography,
                latitude=sample.location.latitude,
                longitude=sample.location.longitude,
                elevation=sample.location.elevation,
                location_comment=sample.location.comment,
                site_name=sample.site.name,
                site_code=sample.site.code,
                site_context=sample.site.context,
                site_comment=sample.site.comment,
                site_stratigraphic_position=sample.site.stratigraphic_position,
                site_stratigraphy_comment=sample.site.stratigraphy_comment,
                domainelement=data['DomainElement'][sample.sample_category],
                valueset=vs,
                artefact_id=sample.artefact.id,
                artefact_name=sample.artefact.name,
                artefact_category=sample.artefact.category,
                artefact_comment=sample.artefact.comment,
                artefact_attributes=sample.artefact.attributes,
                artefact_collector=sample.artefact.collector,
                artefact_collection_type=sample.artefact.collection_type,
                artefact_collection_location=sample.artefact.
                collection_location,
                artefact_collection_comment=sample.artefact.collection_comment,
                artefact_fieldwork_date=sample.artefact.fieldwork_date,
            )
            DBSession.add(
                models.SampleReference(
                    description='sample',
                    sample=v,
                    source=data['Source'][sample.source_id]))
            for ref in sample.artefact.source_ids:
                DBSession.add(
                    models.SampleReference(description='artefact',
                                           sample=v,
                                           source=data['Source'][ref]))
            for ref in sample.site.source_ids:
                DBSession.add(
                    models.SampleReference(description='site',
                                           sample=v,
                                           source=data['Source'][ref]))

        a = data.add(
            models.Analysis,
            analysis.id,
            id=better_slug(analysis.id),
            name=analysis.id,
            sample=v,
        )

        for i, measurement in enumerate(analysis.measurements):
            if i == 0:
                method = measurement.method
                if method:
                    a.analyzed_material_1 = method.analyzed_material_1,
                    a.analyzed_material_2 = method.analyzed_material_2,
                    a.sample_preparation = method.sample_preparation,
                    a.chemical_treatment = method.chemical_treatment,
                    a.technique = method.technique,
                    a.laboratory = method.laboratory,
                    a.analyst = method.analyst,

            pid = slug(measurement.parameter, lowercase=False)
            p = data['Param'].get(pid)
            if not p:
                p = data.add(models.Param,
                             pid,
                             id=pid,
                             name=measurement.parameter)
            data.add(
                models.Measurement,
                None,
                id='{0}-{1}'.format(a.id, p.id),
                analysis=a,
                method=data['Method'].get(measurement.method.id)
                if measurement.method else None,
                value=measurement.value,
                less=measurement.less,
                precision=measurement.value_sd,
                sigma=measurement.sd_sigma,
                unitparameter=p,
            )
Esempio n. 11
0
def load(args):
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(args.args[0]),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    data = Data()
    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('bank', 'Sebastian Bank'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    glottolog = args.repos
    for ma in Macroarea:
        data.add(
            models.Macroarea,
            ma.name,
            id=ma.name,
            name=ma.value,
            description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()
    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
Esempio n. 12
0
def create(args):
    args.log.info('starting migration ...')
    data = Data()
    db = create_engine('postgresql://robert@/glottolog2')

    with transaction.manager:
        sn = data.add(common.Contributor,
                      'sn',
                      id='sn',
                      name='Sebastian Nordhoff')
        hh = data.add(common.Contributor,
                      'hh',
                      id='hh',
                      name='Harald Hammarström')
        rf = data.add(common.Contributor,
                      'rf',
                      id='rf',
                      name='Robert Forkel',
                      url="https://github.com/xrotwang")
        mh = data.add(common.Contributor,
                      'mh',
                      id='mh',
                      name='Martin Haspelmath')
        contrib = data.add(common.Contribution,
                           'c',
                           id='classification',
                           name='Classification')
        data.add(common.ContributionContributor,
                 'hh',
                 contribution=contrib,
                 contributor=hh)
        params = dict(
            fc=data.add(common.Parameter,
                        'fc',
                        id='fc',
                        name='Family classification'),
            sc=data.add(common.Parameter,
                        'sc',
                        id='sc',
                        name='Subclassification'),
        )

        dataset = data.add(
            common.Dataset,
            'd',
            id='glottolog',
            name='Glottolog 2.0',
            description='',
            published=datetime.date(2013, 8, 15),
            domain='glottolog.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
                'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'
            })
        for i, ed in enumerate([sn, hh, rf, mh]):
            DBSession.add(
                common.Editor(dataset=dataset, contributor=ed, ord=i + 1))

        valuesets = {}

        def create_languoid(row, father_pk=None):
            glottocode = {
                'akun1242': 'akun1241'
            }.get(row['alnumcode'], row['alnumcode'])
            attrs = dict(
                pk=row['id'],
                id=glottocode,
                name=row['primaryname'],
                description=row['globalclassificationcomment'],
                level=getattr(models2.LanguoidLevel, row['level']),
                status=getattr(models2.LanguoidStatus,
                               (row['status'] or '').replace(' ', '_'), None),
                father_pk=father_pk,
                created=row['updated'],
                jsondata={} if not row['hname'] else {'hname': row['hname']},
            )
            for attr in [
                    'active',
                    'updated',
                    'hid',
                    'latitude',
                    'longitude',
            ]:
                attrs[attr] = row[attr]
            l = data.add(models2.Languoid, row['id'], **attrs)
            for type_ in params:
                id_ = '%s%s' % (type_, row['id'])
                vs = data.add(common.ValueSet,
                              id_,
                              id=id_,
                              description=row['classificationcomment'] if type_
                              == 'fc' else row['subclassificationcomment'],
                              language=l,
                              parameter=params[type_],
                              contribution=contrib)
                data.add(common.Value,
                         id_,
                         id=id_,
                         name='%s - %s' % (row['level'], row['status']),
                         valueset=vs)
                DBSession.flush()
                valuesets[id_] = vs.pk
            return str(row['id'])

        level = 0
        parents = [
            create_languoid(row) for row in db.execute(
                'select * from languoidbase where father_id is null')
        ]
        while parents:
            args.log.info('level: %s' % level)
            level += 1
            parents = [
                create_languoid(
                    row, father_pk=data['Languoid'][row['father_id']].pk)
                for row in db.execute(
                    'select * from languoidbase where father_id in (%s)' %
                    ','.join(parents))
            ]

    def handler(offset, batch):
        svalues = []
        rvalues = []
        for row in batch:
            jsondata = json.loads(row['jsondata'] or "{}")
            jsondata['bibtexkey'] = row['bibtexkey']
            dicts = {
                's':
                dict(pk=row['id'],
                     polymorphic_type='base',
                     id=str(row['id']),
                     name='%(author)s %(year)s' % row,
                     description=row['title'],
                     bibtex_type=getattr(EntryType, row['type']),
                     jsondata=jsondata),
                'r':
                dict(pk=row['id']),
            }
            for model, map_ in {
                    's': {
                        'author': None,
                        'yearstring': 'year',
                        'year': 'year_int',
                        'startpage': 'startpage_int',
                        'numberofpages': 'pages_int',
                        'pages': None,
                        'edition': None,
                        'school': None,
                        'address': None,
                        'url': None,
                        'note': None,
                        'number': None,
                        'series': None,
                        'editor': None,
                        'booktitle': None,
                        'journal': None,
                        'volume': None,
                        'publisher': None,
                    },
                    'r': {
                        'endpage': 'endpage_int',
                        'inlg': None,
                        'inlg_code': None,
                        'subject': None,
                        'subject_headings': None,
                        'keywords': None,
                        'normalizedauthorstring': None,
                        'normalizededitorstring': None,
                        'ozbib_id': None,
                    }
            }.items():
                for okey, nkey in map_.items():
                    dicts[model][nkey or okey] = row[okey]
            svalues.append(dicts['s'])
            rvalues.append(dicts['r'])
        DBSession.execute(common.Source.__table__.insert(), svalues)
        DBSession.execute(models2.Ref.__table__.insert(), rvalues)

    select(db, 'select * from refbase order by id', handler)
    DBSession.execute('COMMIT')

    for table, model, value, order in [
        ('macroarea', models2.Macroarea,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['name'],
                             description=row['description']), None),
        ('country', models2.Country,
         lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']),
         None),
        ('provider', models2.Provider,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['description'],
                             description=row['comment'],
                             abbr=row['abbr'],
                             url=row['url'],
                             refurl=row['refurl'],
                             bibfield=row['bibfield']), None),
        ('doctype', models2.Doctype,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             abbr=row['abbr'],
                             name=row['name'],
                             description=row['description']), None),
        ('refprovider', models2.Refprovider, lambda i, row: dict(
            pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']),
         ('provider_id', 'refbase_id')),
        ('refdoctype', models2.Refdoctype, lambda i, row: dict(
            pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']),
         ('doctype_id', 'refbase_id')),
    ]:
        insert(db, table, model, value, order=order)

    names = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'namebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['namestring'],
                                            type='name',
                                            description=row['nameprovider'],
                                            lang=row['inlg'] if row['inlg'] and
                                            len(row['inlg']) <= 3 else 'en'),
                        order='id'))

    codes = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'codebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['codestring'],
                                            type=common.IdentifierType.iso.
                                            value if row['codeprovider'] ==
                                            'ISO' else row['codeprovider']),
                        start=len(names),
                        order='id'))

    res = insert(
        db, 'nodecodes', common.LanguageIdentifier,
        lambda i, row: dict(pk=i,
                            language_pk=row['languoidbase_id'],
                            identifier_pk=codes[row['codebase_id']]))

    insert(db,
           'nodenames',
           common.LanguageIdentifier,
           lambda i, row: dict(pk=i,
                               language_pk=row['languoidbase_id'],
                               identifier_pk=names[row['namebase_id']]),
           start=len(res))

    for table, model, value in [
        ('languoidmacroarea', models2.Languoidmacroarea,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             macroarea_pk=row['macroarea_id'])),
        ('languoidcountry', models2.Languoidcountry,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             country_pk=row['country_id'])),
        ('noderefs', common.LanguageSource,
         lambda i, row: dict(pk=i,
                             language_pk=row['languoidbase_id'],
                             source_pk=row['refbase_id'])),
        ('refmacroarea', models2.Refmacroarea, lambda i, row: dict(
            pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])),
        ('refcountry', models2.Refcountry, lambda i, row: dict(
            pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])),
        ('spuriousreplacements', models2.Superseded,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             replacement_pk=row['replacement_id'],
                             description=row['relation'])),
        ('justification', common.ValueSetReference, lambda i, row: dict(
            pk=i,
            valueset_pk=valuesets['%s%s' % ('fc' if row[
                'type'] == 'family' else 'sc', row['languoidbase_id'])],
            source_pk=row['refbase_id'],
            description=row['pages'])),
    ]:
        insert(db, table, model, value)
Esempio n. 13
0
def main(args):

    data = Data()

    dataset = common.Dataset(
        id=amsd.__name__,
        name="AMSD",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='amsd.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    editors = OrderedDict([('Piers Kelly', None)])

    # data_entry => Contributor
    for row in sorted(dicts('data_entry'), key=lambda x: [
                x['name'].lower()] ):
        if row['name'] in editors:
            editors[row['name']] = row['pk']
        data.add(
            common.Contributor,
            row['pk'],
            id=row['pk'],
            name=row['name']
        )

    for i, cid in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=data['Contributor'][cid], ord=i + 1)

    for row in dicts('source_citation'):
        data.add(
            common.Source,
            row['pk'],
            id=row['pk'],
            note=row['name'],
            name=row['name'],
        )

    for row in dicts('ling_area'):
        data.add(
            models.ling_area,
            row['pk'],
            chirila_name = row['chirila_name'],
            austlang_code = row['austlang_code'],
            austlang_name = row['austlang_name'],
            glottolog_code = row['glottolog_code'],
        )
    fd = {}
    for row in dicts('linked_filenames'):
        if row['name'] not in ['00-Text_reference.png', '00-No_image_available.png']:
            fd[row['pk']] = dict(
                name = row['name'],
                oid = row['oid'],
                path = row['path'],
                mimetype = mimetypes.guess_type(row['path'])[0],
            )

    for m in 'item_type technique keywords material source_type sem_domain holder_file'.split():
        for row in dicts(m):
            data.add(
                getattr(models, m),
                row['pk'],
                name = row['name'],
            )

    DBSession.flush()

    # sticks => MessageStick
    no_fts_cols = ['pk', 'latitude', 'longitude', 'item_type',
        'irn', 'data_entry', 'dim_1', 'dim_2', 'dim_3', 'data_entry',
        'ling_area_1', 'ling_area_2', 'ling_area_3', 'holder_file']
    x_cols = ['sem_domain', 'material', 'source_type', 'technique', 'keywords', 'holder_file', 'item_type']
    for i, row in enumerate(dicts('sticks')):

        fts_items = []
        for col in row.keys():
            if col:
                if col == 'amsd_id':
                    fts_items.append(row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),)
                elif col not in no_fts_cols and not col.endswith('_pk'):
                    fts_items.append(row[col])

        for t in x_cols:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    fts_items.append(str(data[t][k]))
                    fts_items.extend(str(data[t][k]).split('_'))

        for t in ['ling_area_1', 'ling_area_2', 'ling_area_3']:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    fts_items.append(data['ling_area'][k].chirila_name)
                    fts_items.append(data['ling_area'][k].austlang_code)
                    fts_items.append(data['ling_area'][k].austlang_name)
                    fts_items.append(data['ling_area'][k].glottolog_code)

        if row['source_citation']:
            for k in row['source_citation'].split(';'):
                data.add(
                    common.ContributionReference,
                    k,
                    contribution_pk = int(row['pk']),
                    source_pk = int(k),
                )
                fts_items.append(str(data['Source'][k]))

        if row['linked_filenames']:
            for j, k in enumerate(row['linked_filenames'].split(';')):
                if k in fd:
                    oid = fd[k].get('oid')
                    mt = fd[k].get('mimetype')
                    refobjid = ''
                    if mt == 'application/pdf':
                        refobjid = oid
                        # use for web, thumbnail a place holder image
                        oid = 'EAEA0-52CC-0295-6B71-0'
                    n = fd[k].get('name')
                    data.add(
                        common.Contribution_files,
                        k,
                        id='%s-%s-%i' % (k, row['pk'], j),
                        object_pk = int(row['pk']),
                        name = n,
                        jsondata = dict(
                                original = fd[k].get('path'),
                                objid = oid,
                                refobjid = refobjid,
                                web = 'web.jpg',
                                thumbnail = 'thumbnail.jpg',
                            ),
                        ord=j,
                        mime_type = mt,
                    )
                    fts_items.append(n)
                    fts_items.extend(nfilter(re.split('[_\-\.]', n)))

        data.add(
            models.MessageStick,
            row['pk'],
            id = row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),
            title = row['title'],
            description = row['description'],
            obj_creator = row['obj_creator'],
            date_created = row['date_created'],
            note_place_created = row['note_place_created'],
            place_created = row['place_created'],
            item_type_pk = row['item_type'] or None,
            ling_area_1_pk = row['ling_area_1'] or None,
            ling_area_2_pk = row['ling_area_2'] or None,
            ling_area_3_pk = row['ling_area_3'] or None,
            notes_ling_area = row['notes_ling_area'],
            stick_term = row['stick_term'],
            message = row['message'],
            motifs = row['motifs'],
            motif_transcription = row['motif_transcription'],
            dim_1 = row['dim_1'],
            dim_2 = row['dim_2'],
            dim_3 = row['dim_3'],
            date_collected = row['date_collected'],
            holder_file_pk = row['holder_file'] or None,
            holder_obj_id = row['holder_obj_id'],
            collector = row['collector'],
            place_collected = row['place_collected'],
            creator_copyright = row['creator_copyright'],
            file_copyright = row['file_copyright'],
            latitude = row['lat'] or None,
            longitude = row['long'] or None,
            notes_coords = row['notes_coords'],
            url_institution = row['url_institution'],
            url_source_1 = row['url_source_1'],
            url_source_2 = row['url_source_2'],
            irn = row['irn'],
            notes = row['notes'],
            data_entry = row['data_entry'],
            fts = fts.tsvector('\n'.join(re.sub('[_\-]','.',v) for v in fts_items)),
        )

    DBSession.flush()
    for row in dicts('sticks'):
        for t in ['sem_domain', 'material', 'source_type', 'technique', 'keywords']:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    data.add(
                        getattr(models, 'x_%s' % (t)),
                        k,
                        object_pk = int(row['pk']),
                        item_pk = int(k),
                    )
Esempio n. 14
0
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    clts = CLTS(
        input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(
            vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=ord,
                          dataset=ds,
                          contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(Counterpart,
                 form['id'],
                 id=form['id'],
                 name=form['form'],
                 valueset=vs,
                 audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv
                       if getattr(c, 'name', None)])
Esempio n. 15
0
def main(args):
    data = Data()
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings',
                           'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(list(
        reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
                      key=lambda r: (r['InventoryID'], r['GlyphID']))

    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(languoids[glottocodes[p['InventoryID']]].name,
                         p['SpecificDialect'], p['Source'].upper())].add(
                             (p['InventoryID'], p['LanguageName']))

    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname,
                                                    dname) + ' [%s]' % source
        else:
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid,
                    lname) in enumerate(sorted(invids,
                                               key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1, )
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(
                    glname, dname) + '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes, lambda p: (p['InventoryID'], p['LanguageName'], p[
                'SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety,
                gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory,
            invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    return

    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'),
               delimiter='\t',
               namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    # pull in Glottolog families instead? or in addition?

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!

    DBSession.flush()
Esempio n. 16
0
def main(args):  # pragma: no cover
    glottocodes = {}
    for row in GC.execute(
            'select ll.hid, l.id from language as l, languoid as ll where ll.pk = l.pk'
    ):
        if row[0] and len(row[0]) == 3:
            glottocodes[row[0]] = row[1]

    icons = issues.Icons()
    old_db = DB

    vs2008 = get_vs2008(args)

    missing_sources = []
    refdb_ids = {}
    max_id = 7350
    with open('/home/robert/venvs/clld/data/wals-data/missing_source.py',
              'w') as fp:
        for row in old_db.execute("select * from reference"):
            try:
                author, year = row['id'].split('-')
            except:
                author, year = None, None
            bibdata = get_source(row['id'])
            if not bibdata:
                fp.write('"%s",\n' % row['id'])
                missing_sources.append(row['id'])
                bibdata['pk'] = max_id
                max_id += 1

            if bibdata['pk'] in refdb_ids:
                print('already seen:', row['id'], 'as',
                      refdb_ids[bibdata['pk']])
                data['Source'][row['id']] = data['Source'][refdb_ids[
                    bibdata['pk']]]
                continue
            refdb_ids[bibdata['pk']] = row['id']

            bibdata.update({
                'id':
                row['id'],
                'name':
                row['name'],
                'description':
                bibdata.get('title', bibdata.get('booktitle')),
                'google_book_search_id':
                row['gbs_id'] or None,
            })
            data.add(common.Source, row['id'], **bibdata)

        #
        # TODO: add additional bibdata as data items
        #

    print('sources missing for %s refs' % len(missing_sources))

    for id, name in ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id, name=name))

    migrate(
        'country', models.Country, lambda r:
        (r['id'], dict(id=r['id'], name=r['name'], continent=r['continent'])))

    migrate(
        'family', models.Family, lambda r:
        (r['id'], dict(id=r['id'], name=r['name'], description=r['comment'])))

    for row, icon in zip(
            list(old_db.execute("select * from genus order by family_id")),
            cycle(iter(icons))):
        genus = data.add(models.Genus,
                         row['id'],
                         id=row['id'],
                         name=row['name'],
                         icon=icon,
                         subfamily=row['subfamily'])
        genus.family = data['Family'][row['family_id']]
    DBSession.flush()

    migrate(
        'altname', common.Identifier, lambda r:
        ((r['name'], r['type']),
         dict(name=r['name'], type='name', description=r['type'])))

    # names for isolanguages are not unique!
    enames = {}
    for r in DB.execute("select * from isolanguage"):
        id_ = 'ethnologue-%s' % r['id']
        if r['name'] in enames:
            data['Identifier'][id_] = enames[r['name']]
        else:
            enames[r['name']] = data.add(common.Identifier,
                                         id_,
                                         id=id_,
                                         name=r['name'],
                                         type='name',
                                         description='ethnologue')
    DBSession.flush()

    migrate(
        'isolanguage', common.Identifier, lambda r:
        (r['id'],
         dict(id=r['id'],
              name=r['id'],
              type=common.IdentifierType.iso.value,
              description=r['name'])))

    migrate(
        'isolanguage', common.Identifier, lambda r: None
        if r['id'] not in glottocodes else
        ('gc-%s' % r['id'],
         dict(id='gc-%s' % r['id'],
              name=glottocodes[r['id']],
              type=common.IdentifierType.glottolog.value,
              description=r['name'])))

    migrate(
        'language', models.WalsLanguage, lambda r:
        (r['id'],
         dict(id=r['id'],
              name=r['name'],
              latitude=r['latitude'],
              longitude=r['longitude'],
              ascii_name=r['ascii_name'],
              genus=data['Genus'][r['genus_id']],
              samples_100=r['samples_100'] != 0,
              samples_200=r['samples_200'] != 0)))

    migrate(
        'author', common.Contributor, lambda r:
        (r['id'],
         dict(name=r['name'], url=r['www'], id=r['id'], description=r['note'])
         ))

    dataset = common.Dataset(
        id='wals',
        name='WALS Online',
        description='The World Atlas of Language Structures Online',
        domain='wals.info',
        published=date(2013, 8, 15),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon':
            'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
            'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'
        })
    DBSession.add(dataset)

    for i, editor in enumerate(['dryerms', 'haspelmathm']):
        common.Editor(dataset=dataset,
                      contributor=data['Contributor'][editor],
                      ord=i + 1)

    migrate(
        'country_language', models.CountryLanguage,
        lambda r: dict(language_pk=data['WalsLanguage'][r['language_id']].pk,
                       country_pk=data['Country'][r['country_id']].pk))

    migrate(
        'altname_language', common.LanguageIdentifier,
        lambda r: dict(language=data['WalsLanguage'][r['language_id']],
                       identifier=data['Identifier'][
                           (r['altname_name'], r['altname_type'])],
                       description=r['relation']))

    migrate(
        'isolanguage_language', common.LanguageIdentifier,
        lambda r: dict(language=data['WalsLanguage'][r['language_id']],
                       identifier=data['Identifier'][r['isolanguage_id']],
                       description=r['relation']))

    migrate(
        'isolanguage_language', common.LanguageIdentifier, lambda r: None
        if 'ethnologue-%s' % r['isolanguage_id'] not in data['Identifier'] else
        dict(language=data['WalsLanguage'][r['language_id']],
             identifier=data['Identifier']['ethnologue-%s' % r['isolanguage_id'
                                                               ]],
             description=r['relation']))

    migrate(
        'isolanguage_language', common.LanguageIdentifier, lambda r: None
        if 'gc-%s' % r['isolanguage_id'] not in data['Identifier'] else dict(
            language=data['WalsLanguage'][r['language_id']],
            identifier=data['Identifier']['gc-%s' % r['isolanguage_id']],
            description=r['relation']))

    migrate(
        'area', models.Area, lambda r:
        (r['id'],
         dict(name=r['name'], dbpedia_url=r['dbpedia_url'], id=str(r['id']))))

    def migrate_chapter(row):
        kw = dict(id=row['id'],
                  name=row['name'],
                  wp_slug=row['blog_title'],
                  sortkey=int(row['id']),
                  area=data['Area'][row['area_id']])
        if int(row['id']) in [143, 144]:
            kw['created'] = E2011
            kw['updated'] = E2011
        return row['id'], kw

    migrate('chapter', models.Chapter, migrate_chapter)

    def migrate_supplement(row):
        if row['name'] not in ['Help', 'Abbreviations']:
            sortkey = 990 + int(
                row['id']) if row['name'] != 'Introduction' else 0
            id_ = 's%s' % row['id']
            kw = dict(id=id_, name=row['name'], sortkey=sortkey)
            return id_, kw

    migrate('supplement', models.Chapter, migrate_supplement)

    migrate(
        'chapter_reference', common.ContributionReference,
        lambda r: dict(contribution=data['Chapter'][r['chapter_id']],
                       source=data['Source'][r['reference_id']]))

    migrate(
        'reference_supplement', common.ContributionReference, lambda r: dict(
            contribution=data['Chapter']['s%s' % r['supplement_id']],
            source=data['Source'][r['reference_id']]))

    def migrate_feature(row):
        kw = dict(id=row['id'],
                  name=row['name'],
                  ordinal_qualifier=row['id'][-1])
        if row['id'].startswith('143') or row['id'].startswith('144'):
            kw['created'] = E2011
            kw['updated'] = E2011
        kw['chapter'] = data['Chapter'][row['chapter_id']]
        return row['id'], kw

    migrate('feature', models.Feature, migrate_feature)

    def migrate_value(row):
        desc = row['description']
        if desc == 'SOV & NegV/VNeg':
            if row['icon_id'] != 's9ff':
                desc += ' (a)'
            else:
                desc += ' (b)'
        kw = dict(id='%s-%s' % (row['feature_id'], row['numeric']),
                  name=desc,
                  description=row['long_description'],
                  jsondata=dict(icon=issues.Icons.id(row['icon_id'])),
                  number=row['numeric'],
                  parameter=data['Feature'][row['feature_id']])
        return (row['feature_id'], row['numeric']), kw

    migrate('value', common.DomainElement, migrate_value)

    same = 0
    added = 0
    for row in old_db.execute("select * from datapoint"):
        parameter = data['Feature'][row['feature_id']]
        language = data['WalsLanguage'][row['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        created = E2008
        updated = E2008

        value_numeric = row['value_numeric']
        if (language.id, parameter.id) in vs2008:
            if vs2008[(language.id, parameter.id)] != row['value_numeric']:
                print('~~~', id_, vs2008[(language.id, parameter.id)], '-->',
                      row['value_numeric'])
                value_numeric = vs2008[(language.id, parameter.id)]
            else:
                same += 1
        else:
            updated = E2011
            created = E2011
            if parameter.id[-1] == 'A' and not (
                    parameter.id.startswith('143')
                    or parameter.id.startswith('144')):
                added += 1

        kw = dict(id=id_, updated=updated, created=created)
        valueset = data.add(common.ValueSet,
                            row['id'],
                            language=language,
                            parameter=parameter,
                            contribution=parameter.chapter,
                            **kw)
        data.add(common.Value,
                 id_,
                 domainelement=data['DomainElement'][(row['feature_id'],
                                                      value_numeric)],
                 valueset=valueset,
                 **kw)

    print(same, 'datapoints did not change')
    print(added, 'datapoints added to existing features')

    DBSession.flush()

    migrate(
        'datapoint_reference', common.ValueSetReference,
        lambda r: dict(valueset=data['ValueSet'][r['datapoint_id']],
                       source=data['Source'][r['reference_id']],
                       description=r['note']))

    migrate(
        'author_chapter', common.ContributionContributor,
        lambda r: dict(ord=r['order'],
                       primary=r['primary'] != 0,
                       contributor_pk=data['Contributor'][r['author_id']].pk,
                       contribution_pk=data['Chapter'][r['chapter_id']].pk))

    migrate(
        'author_supplement', common.ContributionContributor, lambda r: dict(
            ord=r['order'],
            primary=r['primary'] != 0,
            contributor_pk=data['Contributor'][r['author_id']].pk,
            contribution_pk=data['Chapter']['s%s' % r['supplement_id']].pk))

    igts = defaultdict(lambda: [])
    for row in old_db.execute("select * from igt"):
        d = {'id': 'igt-%s' % row['id']}
        d.update(parse_igt(row['xhtml']))
        igts[row['example_id']].append(d)

    for row in old_db.execute("select * from example"):
        if not row['language_id']:
            print('example without language:', row['id'])
            continue

        _igts = igts[row['id']]
        if _igts:
            for igt in _igts:
                data.add(common.Sentence,
                         igt['id'],
                         markup_comment=row['xhtml'],
                         language=data['WalsLanguage'][row['language_id']],
                         **igt)
        else:
            name = teaser(row['xhtml'])
            if name:
                data.add(common.Sentence,
                         row['id'],
                         id=str(row['id']),
                         name=name,
                         xhtml=row['xhtml'],
                         language=data['WalsLanguage'][row['language_id']])

    missing = {}
    for row in old_db.execute("select * from example_feature"):
        _igts = igts[row['example_id']]
        if _igts:
            for igt in _igts:
                try:
                    sentence = data['Sentence'][igt['id']]
                except KeyError:
                    print('missing sentence:', row['example_id'])
                    continue
                try:
                    value = data['Value']['%s-%s' % (row['feature_id'],
                                                     sentence.language.id)]
                    DBSession.add(
                        common.ValueSentence(sentence=sentence, value=value))
                except KeyError:
                    missing[(row['feature_id'], sentence.language.id)] = 1
        else:
            try:
                sentence = data['Sentence'][row['example_id']]
            except KeyError:
                print('missing sentence:', row['example_id'])
                continue
            try:
                value = data['Value']['%s-%s' % (row['feature_id'],
                                                 sentence.language.id)]
                DBSession.add(
                    common.ValueSentence(sentence=sentence, value=value))
            except KeyError:
                missing[(row['feature_id'], sentence.language.id)] = 1

    print(len(missing), 'missing datapoints for example_feature relations')
Esempio n. 17
0
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' %
                                          (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Esempio n. 18
0
def main(args):  # pragma: no cover
    data = Data()

    print("Setting up dataset…")
    dataset = common.Dataset(
        id=cariban.__name__,
        domain="cariban.clld.org",
        name="Comparative Cariban Database",
        description="Comparative Cariban Database",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_url="https://www.eva.mpg.de",
        publisher_place="Leipzig",
        license="https://creativecommons.org/licenses/by/4.0/",
        contact="*****@*****.**",
        jsondata={'function_paradigms': []},
    )

    fps = []
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            for cons in morph_func["Construction"]:
                fps.append({
                    'Function': function,
                    'Construction': cons,
                    'Morpheme': morph_func['Morpheme']})
    dataset.update_jsondata(function_paradigms=fps)

    DBSession.add(dataset)
    DBSession.flush()

    print("Adding contributors…")
    c = common.Contributor(id="fm",name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/")
    dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True))

    print("Adding languages…")
    dialect_mapping = {}
    lang_shorthands = {}
    glottocodes = {}
    lang_ids = {}
    for lang in args.cldf["LanguageTable"]:
        if lang["Sampled"] == "y":
            language = data.add(
                common.Language,
                lang["ID"],
                id=lang["ID"],
                name=lang["Name"],
                latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None,
                longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None,
                jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']},
            )
            add_language_codes(data, language, isocode=lang["ISO"], glottocode = lang["Glottocode"])
        if lang["Dialect_Of"] not in [None, "y"]:
            dialect_mapping[lang["ID"]] = lang["Dialect_Of"]
        lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]}
        glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}
        lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}

    def get_lang_id(key):
        if key in lang_shorthands:
            lang_id = lang_shorthands[key]["ID"]
        elif key in glottocodes:
            lang_id = glottocodes[key]["ID"]
        elif key in lang_ids:
            lang_id = key
        else:
            print("Could not identify language %s" % key)
            return None
        if lang_id in dialect_mapping:
            lang_id = dialect_mapping[lang_id]
        return lang_id

    def get_key_and_page(source_string):
        if len(source_string.split("[")) > 1:
            bib_key = source_string.split("[")[0]
            pages = source_string.split("[")[1].split("]")[0]
        else:
            bib_key = source_string
            pages = ""
        return bib_key, pages

    print("Adding sources…")
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    print("Adding language sources…")
    DBSession.flush()
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        if "keywords" in rec:
            for keyword in rec["keywords"].split(","):
                if keyword in lang_shorthands:
                    lang_id = get_lang_id(keyword.strip(" "))
                    if lang_id in data["Language"]:
                        data.add(common.LanguageSource,
                        rec.id+lang_id,
                        language_pk=data["Language"][lang_id].pk,
                        source_pk=data["Source"][rec.id].pk
                        )
        
    data.add(
        common.Source,
        "pc",
        id="pc",
        name="Personal communication",
        description="Placeholder for data obtained from personal communication.",
        bibtex_type=bibtex.EntryType.misc
    )

#     print("Adding glossing abbreviations…")
#     length = len(pynterlinear.get_all_abbrevs().keys())
#     for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()):
#         print("%s/%s" % (i+1, length), end="\r")
#         DBSession.add(common.GlossAbbreviation(id=key, name=name))
#     print("")
#
    print("Adding examples…")
    gloss_replacements = {
        "S_A_": "Sa",
        "S_P_": "Sp"
    }
    def clldify_glosses(gloss_line):
        for orig, new in gloss_replacements.items():
            gloss_line = gloss_line.replace(orig,new)
        gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line)
        return gloss_line

    for ex in args.cldf["ExampleTable"]:
        lang_id = get_lang_id(ex["Language_ID"])
        new_ex = data.add(common.Sentence,
            ex["ID"],
            id=ex["ID"],
            name=ex["Name"],
            description=ex["Translated_Text"],
            analyzed="\t".join(ex["Analyzed_Word"]),
            gloss=clldify_glosses("\t".join(ex["Gloss"])),
            language=data["Language"][lang_id],
            comment=ex["Comment"],
            markup_gloss="\t".join(ex["Morpheme_IDs"])
        )
        
        if ex["Source"]:
            bib_key, pages = get_key_and_page(ex["Source"])
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(common.SentenceReference(
                    sentence=new_ex,
                    source=source,
                    key=source.id,
                    description=pages.replace("--","–"))
                )

    def add_morpheme_reference(morpheme, source_string):
        bib_key, pages = get_key_and_page(source_string)
        if bib_key in data["Source"]:
            source = data["Source"][bib_key]
            DBSession.add(models.MorphemeReference(
                morpheme=morpheme,
                source=source,
                key=source.id,
                description=pages.replace("--","–")
                )
            )

    print("Adding morphemes…")
    for morph in args.cldf["FormTable"]:
        lang_id = get_lang_id(morph["Language_ID"])
        form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ")
        new_morph = data.add(models.Morpheme,
            morph["ID"],
            morpheme_type="grammatical",
            language=data["Language"][lang_id],
            name="/".join(form),
            id=morph["ID"],
        )
        
        if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0])

    print("Adding constructions…")
    data.add(models.DeclarativeType, "imp", id="imp", name="imperative")
    data.add(models.DeclarativeType, "decl", id="decl", name="declarative")
    data.add(models.MainClauseVerb, "y", id="y", name="main clause construction")
    data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction")

    for cons in args.cldf["ParameterTable"]:
        lang_id = get_lang_id(cons["Language_ID"])
        new_construction = data.add(
            models.Construction,
            cons["ID"],
            id=cons["ID"],
            language=data["Language"][lang_id],
            name=cons["Description"],
            mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]],
        )
        if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]]

    def add_morph_func(morpheme, func_key, construction):
        data.add(models.MorphemeFunction,
            "%s:%s" % (morpheme, function),
            id="%s:%s" % (morpheme, func_key),
            name="MorphemeFunction %s:%s"% (morpheme, func_key),
            unit=data["Morpheme"][morpheme],
            unitparameter=data["Meaning"][function],
            construction=construction
        )

    print("Adding morpheme functions…")
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            func_key = function.replace(".","_")
            if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"):
                meaning_type="inflectional"
            else:
                meaning_type="derivational"
            if function not in data["Meaning"]:
                data.add(models.Meaning,
                    function,
                    id=func_key,
                    name=function,
                    meaning_type=meaning_type
                )
            #Only some morpheme functions are specified as occurring in specific constructions
            if len(morph_func["Construction"]) == 0:
                for morpheme in morph_func["Morpheme"]:
                    add_morph_func(morpheme, func_key, None)
            else:
                for construction in morph_func["Construction"]:
                    if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?":
                        for morpheme in morph_func["Morpheme"]:
                            if data["Morpheme"][morpheme].language != data["Construction"][construction].language:
                                print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % (
                                data["Morpheme"][morpheme].language,
                                data["Morpheme"][morpheme],
                                data["Construction"][construction].language,
                                data["Construction"][construction]
                                )
                                )
                            cons_func_key = func_key + ":" + construction
                            add_morph_func(morpheme, cons_func_key, data["Construction"][construction])

    print("Checking examples for illustrated morphemes…")
    proto_languages = ["pc"]
    is_illustrated = {}
    for key, row in data["MorphemeFunction"].items():
        if row.unit.language.id in proto_languages: continue
        is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False
    for row in args.cldf["ExampleTable"]:
        for word in row["Morpheme_IDs"]:
            morph_ids = util.split_word(word)
            for unit_value in morph_ids:
                if unit_value in ["X","-","=", "~"]:
                    continue
                unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"])
                if unitvaluesentence_key in data["UnitValueSentence"].keys():
                    continue
                is_illustrated[unit_value] = True
                morph_id = unit_value.split(":")[0]
                if morph_id not in data["Morpheme"].keys():
                    print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id))
                elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language:
                    print("Warning: The %s example %s claims to contain the %s morpheme %s." % (
                        data["Sentence"][row["ID"]].language,
                        row["ID"],
                        data["Morpheme"][morph_id].language,
                        data["Morpheme"][morph_id]
                    )
                    )
                if ":" not in unit_value:
                    print("%s in %s contains no defined function!" % (unit_value, row["ID"]))
                function = unit_value.split(":")[1]
                morph_function_id = "%s:%s" % (morph_id, function)
                if morph_function_id not in data["MorphemeFunction"].keys():
                    print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-")))
                    continue
                data.add(models.UnitValueSentence,
                unitvaluesentence_key,
                sentence=data["Sentence"][row["ID"]],
                unitvalue=data["MorphemeFunction"][morph_function_id],
                )


    # see how many morpheme functions are illustrated with example sentences
    good_ill = [key for key, value in is_illustrated.items() if value]
    not_ill = [key for key, value in is_illustrated.items() if not value]
    not_ill.sort()
    cov = len(good_ill)/len(is_illustrated)*100
    print("Morpheme exemplification coverage is at %s%%. List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2)))
    f = open("../unillustrated_morphemes.txt", "w")
    for morph in not_ill:
        f.write(morph+"\n")
    f.close()

    print("Adding cognate sets…")
    for cogset in args.cldf["CognatesetTable"]:
        new_cset = data.add(models.Cognateset,
            cogset["ID"],
            id=cogset["ID"],
            name=cogset["Name"],
            description=cogset["Function"],
            cogset_type="grammatical"
        )
        if cogset["Source"]:
            for source in cogset["Source"]:
                bib_key, pages = get_key_and_page(source)
                if bib_key in data["Source"]:
                    source = data["Source"][bib_key]
                    DBSession.add(models.CognatesetReference(
                        cognateset=new_cset,
                        source=source,
                        key=source.id,
                        description=pages)
                        )

    print("Adding cognates…")
    for morph in args.cldf["FormTable"]:
        for cognate_ID in morph["Cognateset_ID"]:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognate_ID],
                    counterpart=data["Morpheme"][morph["ID"]]
                    )
            )

    print("Adding morpheme comments…")
    for row in args.cldf["FormTable"]:
        data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"])

    print("Adding construction descriptions…")
    for cons in args.cldf["ParameterTable"]:
        if cons["Comment"] is None:
            description = ""
        else:
            description = util.generate_markup(cons["Comment"])
        description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"]))
        description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"]))
        data["Construction"][cons["ID"]].markup_description = description


    print("Adding cognate set descriptions…")
    for cogset in args.cldf["CognatesetTable"]:
        data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"])
        # if cogset["ID"] == "13pro":
        #     data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup(
        #         util.comparative_function_paradigm(
        #             ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"],
        #             "1+3 scenarios",
        #             ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"]))

    
    def add_tree_labels(phylo):
        uncertain_nodes = []
        for node in phylo.find_clades():
            if node.name == None or not node.is_terminal():
                continue
            plain_name = node.name.replace("?","")
            if "?" in node.name: uncertain_nodes.append(plain_name)
            if plain_name in lang_ids:
                node.name = lang_ids[plain_name]["Name"].replace("'", "’")
            if plain_name in uncertain_nodes: node.name += "?"
        return phylo, uncertain_nodes
        
    print("Adding trees…")
    own_trees = ["matter"]
    tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw')
    newick_files = {}
    for tree in args.cldf["cariban_trees.csv"]:
        if tree["ID"] in own_trees: continue
        newick_files[tree["ID"]] = {
            "orig": tree["ID"]+"_orig.newick",
            "norm": tree["ID"]+"_norm.newick",
            "source": tree["Source"],
            "comment": tree["Comment"],
            "o_comment": tree["Orig_Comment"]
        }
    #adding my own trees separately.
    for my_tree_count, tree_id in enumerate(own_trees):
        my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick")
        my_tree, uncertain_nodes = add_tree_labels(my_tree)
        
        edited_tree = io.StringIO()
        Phylo.write(my_tree, edited_tree, "newick")
        tree = edited_tree.getvalue().replace(":0.00000","")
        
        my_phylo = Phylogeny(
                tree_id,
                id=tree_id,
                name="Matter (2020)",# % str(my_tree_count+1),
                newick=tree,
                markup_description="My own, conservative, classification."
        )
        
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=my_phylo
                )
            )
              
        DBSession.add(my_phylo)
        
    #adding the other trees
    for tree_id, values in newick_files.items():
        norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick")
        orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick")
        
        norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree)
            
        edited_tree = io.StringIO()
        Phylo.write(norm_biotree, edited_tree, "newick")
        norm_tree = edited_tree.getvalue().replace(":0.00000","")
        
        edited_tree = io.StringIO()
        Phylo.write(orig_biotree, edited_tree, "newick")
        orig_tree = edited_tree.getvalue().replace(":0.00000","")
        
        norm_phylo = Phylogeny(
                id=tree_id+"_norm",
                name=str(data["Source"][values["source"]]) + " (Normalized)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id +
                util.generate_markup(
                    "<br>Comments: %s" % values["comment"]
                ),
                newick=norm_tree
        )
        
        if values["o_comment"] == None:
            o_comment = ""
        else:
            o_comment = values["o_comment"]
        orig_phylo = Phylogeny(
                id=tree_id+"_orig",
                name=str(data["Source"][values["source"]]) + " (Original)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                    "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id +
                    util.generate_markup(
                    "<br>Comments: %s" % values["comment"] +
                    " " + o_comment
                    ),
                newick=orig_tree
        )
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=norm_phylo
                )
            )
        DBSession.add(norm_phylo)
        DBSession.add(orig_phylo)

    print("Adding t-adding verb cognate sets…")
    for t_verb_set in args.cldf["cariban_t_cognates.csv"]:
        cognate_ID = "t"+t_verb_set["ID"]
        rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"])
        t_cogset = data.add(models.Cognateset,
            cognate_ID,
            id=cognate_ID,
            name=rec_t_form,
            description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"],
            cogset_type="t_adding"
        )
        if t_verb_set["Source"]:
            bib_key = t_verb_set["Source"].split("[")[0]
            if len(t_verb_set["Source"].split("[")) > 1:
                pages = t_verb_set["Source"].split("[")[1].split("]")[0]
            else:
                pages = " "
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(models.CognatesetReference(
                    cognateset=t_cogset,
                    source=source,
                    key=source.id,
                    description=pages)
                    )
    
    print("Adding t-adding verbs…")
    t_langs = {}
    t_verbs = {}
    non_t_adding_lgs = ["ing","mac","kar","wmr","pan"]
    data.add(models.Meaning,
        "t_verb",
        id="t-verb",
        name="t-adding verb",
    )
    for t_verb_entry in args.cldf["cariban_t_verbs.csv"]:
        if t_verb_entry["Language_ID"] == "cari1283": continue
        cognate_ID = "t"+t_verb_entry["Cognateset_ID"]
        lang_id = get_lang_id(t_verb_entry["Language_ID"])
        morph_id = lang_id+"_"+cognate_ID
        if morph_id in data["Morpheme"].keys():
            if morph_id + "_2" in data["Morpheme"].keys():
                morph_id += "_3"
            else:
                morph_id += "_2"
        t_verb = data.add(models.Morpheme,
            morph_id,
            id=morph_id,
            morpheme_type="t_adding",
            name=t_verb_entry["Form"],
            language=data["Language"][lang_id],
        )
        DBSession.add(models.Cognate(
                cognateset=data["Cognateset"][cognate_ID],
                counterpart=t_verb
            )
        )
        if t_verb_entry["t"] == "y":
            t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name)
            t_verb.markup_description = util.generate_markup("Shows cogset:t")
        if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs:
            t_verb.name = "[t-?]"+t_verb.name
            t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t")
        if t_verb_entry["t"] == "n":
            t_verb.markup_description = util.generate_markup("Does not show cogset:t")
        if lang_id not in t_langs.keys():
            t_langs[lang_id] = {"y": 0, "n": 0, "?": 0}
        if cognate_ID not in t_verbs.keys():
            t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0}
        t_langs[lang_id][t_verb_entry["t"]] += 1
        if lang_id not in non_t_adding_lgs:
            t_verbs[cognate_ID][t_verb_entry["t"]] += 1
        if t_verb_entry["Source"]:
            add_morpheme_reference(t_verb, t_verb_entry["Source"])

        data.add(models.MorphemeFunction,
            "t_"+t_verb_entry["ID"],
            id="t_"+t_verb_entry["ID"],
            name="t-Verb %s" % t_verb_entry["Parameter_ID"],
            unit=t_verb,
            unitparameter=data["Meaning"]["t_verb"],
            construction=None
        )
    for lang, values in t_langs.items():
        data["Language"][lang].update_jsondata(t_values=values)
    for verb, values in t_verbs.items():
        # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))
        data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"])))

    print("Adding reconstructed lexemes…")
    proto_forms = {}
    for cogset in args.cldf["cariban_lexical_reconstructions.csv"]:
        proto_forms[cogset["ID"]] = cogset["Form"]

    first_found = []
    for entry in args.cldf["cariban_swadesh_list.csv"]:
        cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"]
        if cognateset_ID not in data["Cognateset"]:
            if cognateset_ID in proto_forms:
                form = "*" + proto_forms[cognateset_ID].replace("; ", " / ")
            # else:
            #     form = ""
                data.add(models.Cognateset,
                    cognateset_ID,
                    id=cognateset_ID,
                    name=form,
                    description=cognateset_ID,
                    cogset_type="lexical"
                )
        lang_id = get_lang_id(entry["Language_ID"])
        if lang_id not in data["Language"]: continue
        function = entry["Parameter_ID"].replace(".","_")
        morph_id = entry["Language_ID"] + "_" + function
        if morph_id in first_found: continue
        first_found.append(morph_id)
        if function not in data["Meaning"].keys():
            data.add(models.Meaning,
                function,
                id=function,
                name=function,
                meaning_type="lexical"
            )
        morpheme = data.add(models.Morpheme,
                    morph_id,
                    id=morph_id,
                    morpheme_type="lexical",
                    name=entry["Value"][0],
                    language=data["Language"][lang_id],
                )
        data.add(models.MorphemeFunction,
            "%s:%s" % (morph_id, function),
            id="%s:%s" % (morph_id, function),
            name="MorphemeFunction %s:%s"% (morph_id, function),
            unit=data["Morpheme"][morph_id],
            unitparameter=data["Meaning"][function],
            construction=None
        )
        if entry["Source"]:
            add_morpheme_reference(morpheme, entry["Source"])
        
        if cognateset_ID in proto_forms:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognateset_ID],
                    counterpart=morpheme
                )
            )
Esempio n. 19
0
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'C**T': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print 'unchecked! ---', row['Language_name']

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print label, 'missing for:', row['Language_name']

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print '--> remapped:', primary_to_segment
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    colors.values()[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print 'multiple values for single-valued parameter: %s' % id_
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print 'unclaimed wals value number:', k, v

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Esempio n. 20
0
def populate_test_db(engine):
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(common.Dataset,
                     domain='clld',
                     jsondata={
                         'license_icon': 'cc-by',
                         'license_url': 'http://example.org'
                     })

    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name',
    }.items():
        data.add(common.Contributor,
                 id_,
                 id=id_,
                 name=name,
                 url='http://example.org')

    DBSession.add(
        common.Editor(dataset=data[common.Dataset],
                      contributor=data[common.Contributor]))

    data.add_default(common.Source)
    data.add(common.Source,
             'replaced',
             id='replaced',
             active=False,
             jsondata={'__replacement_id__': 'source'})

    data.add_default(common.Contribution)
    common.ContributionReference(contribution=data[common.Contribution],
                                 source=data[common.Source])

    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                       (False, 'd')]:
        common.ContributionContributor(contribution=data[common.Contribution],
                                       primary=primary,
                                       contributor=data['Contributor'][c])

    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])

    for i, type_ in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=type_.value,
                id=type_.value + str(i),
                name='abc' if type_.name == 'iso' else 'glot1234'))

    common.LanguageIdentifier(language=data[common.Language],
                              identifier=common.Identifier(type='name',
                                                           id='name',
                                                           name='a'))

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc')
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = data.add_default(common.Parameter)
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2',
                               name='DomainElement2',
                               parameter=param)

    valueset = data.add_default(common.ValueSet,
                                language=data[common.Language],
                                parameter=param,
                                contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')

    data.add_default(common.Value,
                     domainelement=de,
                     valueset=valueset,
                     frequency=50,
                     confidence='high')
    data.add(common.Value,
             'value2',
             id='value2',
             domainelement=de2,
             valueset=valueset,
             frequency=50,
             confidence='high')

    paramnd = data.add(common.Parameter,
                       'no-domain',
                       id='no-domain',
                       name='Parameter without domain')
    valueset = common.ValueSet(id='vs2',
                               language=data[common.Language],
                               parameter=paramnd,
                               contribution=data[common.Contribution])

    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')
    common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    unit = data.add_default(common.Unit, language=data[common.Language])
    up = data.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue',
                     name='UnitValue',
                     unit=unit,
                     unitparameter=up)

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2',
                         name='UnitValue2',
                         unit=unit,
                         unitparameter=up2,
                         unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = data.add_default(common.Sentence,
                                description='sentence description',
                                analyzed='a\tmorpheme\tdoes\tdo',
                                gloss='a\tmorpheme\t1SG\tdo.SG2',
                                source='own',
                                comment='comment',
                                original_script='a morpheme',
                                language=data[common.Language],
                                jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])
    DBSession.add(common.Config(key='key', value='value'))

    common.Config.add_replacement('replaced',
                                  'language',
                                  model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
Esempio n. 21
0
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name=
        'The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(
        ['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                          'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id',
                           'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    inventories = collections.defaultdict(set)
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form',
                          'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']] = inventories[
            form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(inventory=[(str(c), c.name)
                                                        for c in inv
                                                        if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
Esempio n. 22
0
def main(args):
    data = Data()

    for rec in Database.from_file(
            data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 
        'Thiago Chacon', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t',
            namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
                models.SoundSegment,
                key,
                id=key,
                name = line.NAME,
                grapheme=line.GRAPHEME,
                aliases=line.ALIASES,
                representation=len(line.REFLEXES.split(',')),
                reflexes = line.REFLEXES,
                generated = True if line.GENERATED else False,
                unicode = line.UNICODE,
                )
    print('')
    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')



    contributions = {}
    for line in reader(data_path('datasets.tsv'),
            delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
                models.CLTSDataSet,
                line.NAME,
                id=line.NAME,
                name=line.NAME,
                description=line.DESCRIPTION,
                datatype=line.TYPE
                )
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                    source=data['Source'][id_],
                    contribution=contributions[line.NAME])
    
        
    visited = set()
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t",
            namedtuples=True)):
        if not i % 100: print('-', end='')
        key = line.DATASET + ':' + line.NAME+':'+line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                    id=key,
                    description=line.NAME,
                    language=english,
                    contribution=contributions[line.DATASET],
                    parameter=data['SoundSegment'][sound_id]
                    )
            data.add(
                    models.Grapheme,
                    key,
                    id=key,
                    grapheme=line.GRAPHEME,
                    bipa_grapheme=line.BIPA,
                    name=line.NAME,
                    dataset=line.DATASET,
                    datatype=line.DATATYPE,
                    frequency=line.FREQUENCY or 0,
                    image=line.IMAGE,
                    url=line.URL,
                    valueset=vs
                    )
            visited.add(key)
    print('-')
Esempio n. 23
0
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('moselulrike', 'Ulrike Mosel'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')

    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            continue

        if not md['date_published']:
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            continue
        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        #if submission.id != 'sidaama':
        #    continue
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog3/glottolog')
Esempio n. 24
0
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cobl2.__name__,
        name="IE-CoR",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='iecor.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    editors = OrderedDict([('Heggarty', None), ('Anderson', None), ('Scarborough', None)])
    for row in sorted(ds['authors.csv'], key=lambda x: [
            x['Last_Name'].lower(), x['First_Name'].lower()]):
        if row['Last_Name'] in editors:
            editors[row['Last_Name']] = row['ID']
        data.add(
            models.Author,
            row['ID'],
            id=row['ID'],
            name='{0} {1}'.format(row['First_Name'], row['Last_Name']),
            url=row['URL'],
            photo=data_uri(photos[row['Last_Name']], 'image/jpg') if row['Last_Name'] in photos else None)

    for i, cid in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=data['Author'][cid], ord=i + 1)

    for src in ds.sources.items():
        for invalid in ['isbn', 'part', 'institution']:
            if invalid in src:
                del src[invalid]
        data.add(
            common.Source,
            src.id,
            id=src.id,
            name=src.get('author', src.get('editor')),
            description=src.get('title', src.get('booktitle')),
            bibtex_type=getattr(EntryType, src.genre, EntryType.misc),
            **src)

    re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)')
    link_map = {
        'cog': '/cognatesets/',
        'lex': '/values/',
        'src': '/sources/',
    }

    def parse_links(m):
        try:
            return '<a href="{}{}">{}</a>'.format(
                link_map[m.group('type')], m.group('id'), m.group('label'))
        except KeyError:
            print("parse_links: type error in '{}'".format(":".join(m.groups())))
            return '[{}]({}-{})'.format(m.group('label'), m.group('type'), m.group('id'))

    for param in ds['ParameterTable']:
        data.add(
            models.Meaning,
            param['ID'],
            id=slug(param['Name']),
            name=param['Name'],
            description_md=param['Description_md'],
            concepticon_id=int(param['Concepticon_ID']) if param['Concepticon_ID'] != '0' else None,
        )

    for row in ds['clades.csv']:
        data.add(
            models.Clade,
            row['ID'],
            id=row['ID'],
            level0_name=row['level0_name'],
            level1_name=row['level1_name'],
            level2_name=row['level2_name'],
            level3_name=row['level3_name'],
            clade_level0=row['clade_level0'],
            clade_level1=row['clade_level1'],
            clade_level2=row['clade_level2'],
            clade_level3=row['clade_level3'],
            clade_name=row['clade_name'],
            short_name=row['short_name'],
            color=row['color'],
        )

    for row in ds['LanguageTable']:
        c = data.add(
            common.Contribution,
            row['ID'],
            id=row['ID'],
            name=row['Name'],
        )
        for i, cid in enumerate(row['Author_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c, contributor=data['Author'][cid], ord=i + 1))
        data.add(
            models.Variety,
            row['ID'],
            id=slug(row['Name']),
            name=row['Name'],
            latitude=float(row['Latitude']) if row['Latitude'] is not None else None,
            longitude=float(row['Longitude']) if row['Longitude'] is not None else None,
            contribution=c,
            color=rgb_as_hex(row['Color']),
            clade=', '.join(filter(None, row['Clade'])),
            clade_name=row['clade_name'],
            glottocode=row['Glottocode'],
            historical=row['historical'],
            distribution=row['distribution'],
            logNormalMean=row['logNormalMean'],
            logNormalOffset=row['logNormalOffset'],
            logNormalStDev=row['logNormalStDev'],
            normalMean=row['normalMean'],
            normalStDev=row['normalStDev'],
            ascii_name=row['ascii_name'],
            iso=row['ISO639P3code'],
            lang_description=row['Description'],
            variety=row['Variety'],
            loc_justification=row['loc_justification'] or None,
            sort_order=row['sort_order']
        )

    vsrs = set()
    for row in ds['FormTable']:
        vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID']))
        if not vs:
            vs = data.add(
                common.ValueSet,
                (row['Language_ID'], row['Parameter_ID']),
                id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']),
                language=data['Variety'][row['Language_ID']],
                parameter=data['Meaning'][row['Parameter_ID']],
                contribution=data['Contribution'][row['Language_ID']],
            )
        v = data.add(
            models.Lexeme,
            row['ID'],
            id=row['ID'],
            name=row['Form'],
            native_script=row['native_script'],
            phonetic=row['phon_form'],
            phonemic=row['Phonemic'],
            comment=re_links.sub(parse_links, row['Comment'] or ''),
            url=row['url'],
            gloss=row['Gloss'],
            valueset=vs
        )
        for src in row['Source']:
            sid, pages = ds.sources.parse(src)
            key = (vs.id, sid, pages)
            if pages:
                pages = pages.replace('|', ';')
            if key not in vsrs:
                DBSession.add(common.ValueSetReference(
                    valueset=vs, source=data['Source'][sid], description=pages))
                vsrs.add(key)

    for row in ds['CognatesetTable']:
        cc = data.add(
            models.CognateClass,
            row['ID'],
            id=row['ID'],
            name=row['ID'],
            root_form=row['Root_Form_calc'] if row['Root_Form_calc'] is not None and len(row['Root_Form_calc']) else row['Root_Form'],
            root_form_calc=row['Root_Form_calc'] or None,
            root_gloss=row['Root_Gloss'] or None,
            root_language=row['Root_Language_calc'] if row['Root_Language_calc'] is not None and len(row['Root_Language_calc']) else row['Root_Language'],
            root_language_calc=row['Root_Language_calc'] or None,
            comment=re_links.sub(parse_links, row['Comment'] or ''),
            justification=re_links.sub(parse_links, row['Justification'] or ''),
            ideophonic=row['Ideophonic'] or None,
            parallel_derivation=row['parallelDerivation'] or None,
            revised_by=','.join(row['revised_by']) or None,
            superset_id=int(row['supersetid']) if row['supersetid'] else None,
        )
        for src in row['Source']:
            sid, pages = ds.sources.parse(src)
            if pages:
                pages = pages.replace('|', ';')
            DBSession.add(clld_cognacy_plugin.models.CognatesetReference(
                cognateset=cc, source=data['Source'][sid], description=pages))

    DBSession.flush()

    cc_id_pk_map = {str(ccid): cc.pk for ccid, cc in data['CognateClass'].items()}
    for row in ds['CognatesetTable']:
        if row['proposedAsCognateTo_pk']:
            DBSession.add(models.ProposedCognates(
                cc1_pk=data['CognateClass'][row['ID']].pk,
                cc2_pk=cc_id_pk_map[str(row['proposedAsCognateTo_pk'])],
                scale=row['proposedAsCognateToScale']
            ))
    DBSession.flush()

    loans = {ln['Cognateset_ID']: ln for ln in ds['loans.csv']}
    for ccid, cc in data['CognateClass'].items():
        if ccid in loans:
            le = loans[ccid]
            if le['SourceCognateset_ID']:
                cc.loan_source_pk = data['CognateClass'][le['SourceCognateset_ID']].pk
            else:
                cc.loan_source_pk = None
            cc.loan_notes = le['Comment']
            cc.loan_source_languoid = le['Source_languoid']
            cc.loan_source_form = le['Source_form']
            cc.parallel_loan_event = le['Parallel_loan_event']
            cc.is_loan = True

    for row in ds['CognateTable']:
        cc = data['CognateClass'][row['Cognateset_ID']]
        if cc.meaning_pk is None:
            cc.meaning_pk = data['Lexeme'][row['Form_ID']].valueset.parameter_pk
        else:
            assert data['Lexeme'][row['Form_ID']].valueset.parameter_pk == cc.meaning_pk
        data.add(
            clld_cognacy_plugin.models.Cognate,
            row['ID'],
            cognateset=data['CognateClass'][row['Cognateset_ID']],
            counterpart=data['Lexeme'][row['Form_ID']],
            doubt=row['Doubt'],
        )

    l_by_gc = {}
    for s in DBSession.query(models.Variety):
        l_by_gc[s.glottocode] = s.pk

    tree = Phylogeny(
        id='1',
        name='Bouckaert et al.',
        description='',
        newick=Path.read_text(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'newick.txt'),
    )
    for k, taxon in enumerate(reader(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'taxa.csv', namedtuples=True)):
        label = TreeLabel(
            id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1),
            name=taxon.taxon,
            phylogeny=tree,
            description=taxon.glottocode)
        if taxon.glottocode in l_by_gc:
            LanguageTreeLabel(language_pk=l_by_gc[taxon.glottocode], treelabel=label)
    DBSession.add(tree)

    l_by_ascii = {}
    for s in DBSession.query(models.Variety):
        l_by_ascii[s.ascii_name] = s.pk

    tree = Phylogeny(
        id='2',
        name='CoBL consensu',
        description='',
        newick=Path.read_text(data_file_path / 'raw' / 'ie122' / 'newick.txt'),
    )
    for k, taxon in enumerate(reader(data_file_path / 'raw' / 'ie122' / 'taxa.csv', namedtuples=True)):
        label = TreeLabel(
            id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1),
            name=taxon.taxon,
            phylogeny=tree)
        if taxon.taxon in l_by_ascii:
            LanguageTreeLabel(language_pk=l_by_ascii[taxon.taxon], treelabel=label)
    DBSession.add(tree)
Esempio n. 25
0
def main(args):

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name',
                          'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        # print(cognate)
        data.add(models.Cognate_,
                 cognate['Cognateset_ID'],
                 name=cognate['Form'],
                 language=cognate['Language_ID'],
                 description=cognate['Description'])

    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(
            iteritems(args.cldf, 'ParameterTable', 'ID', 'Name',
                      'Concepticon_ID', 'Description')):
        data.add(models.Concept,
                 param['ID'],
                 id=param['ID'],
                 name='{} [{}]'.format(param['Name'], param['ID']),
                 description=param['Description'],
                 count=len(counts[param['ID']]))

    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?': continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )

            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)

            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] +
                '/' if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
Esempio n. 26
0
def main(args):  # pragma: no cover
    data = Data()
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name)
        )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
        sorted(data['Variety'].values(), key=lambda l: l.canton),
        lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        f = data.add(
            models.Form,
            form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))
        ))
Esempio n. 27
0
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))

    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ],
                                    start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(data, [(l.id, l)
                         for l in data['Variety'].values() if len(l.id) == 8],
                  glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)

    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(),
                                     key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ]),
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent /
            'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid),
                    languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' %
                (phoneme['Value'],
                 data['Inventory'][phoneme['Contribution_ID']].name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))

    return
Esempio n. 28
0
def main(args):

    Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', '')))\
        .create(DBSession.bind)

    data = Data()

    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )

    DBSession.add(dataset)

    for i, (id_, name) in enumerate(
        [("verkerkannemarie", "Annemarie Verkerk"), ("rzymskichristoph", "Christoph Rzymski")]
    ):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    DBSession.add(dataset)

    # Take meta data from curated CLDF data set
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')
    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )
    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )
    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # get orginal forms
    ds = Wordlist.from_metadata(data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution,
        d['id'],
        id=d['id'],
        name=d['name']
    )

    # process curated forms
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add Base info if given
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )

            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        valueset = data["ValueSet"].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            if form["Language_ID"] in data["Variety"]:
                vs = data.add(
                    common.ValueSet,
                    valueset_id,
                    id=valueset_id,
                    language=data["Variety"][form["Language_ID"]],
                    parameter=data["NumberParameter"][form["Parameter_ID"]],
                    contribution=contrib,
                )

        org_form = ""
        if form["ID"] in org_forms:
            if unicodedata.normalize('NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                org_form = org_forms[form["ID"]]["Form"]
        else:
            org_form = "no original form"
        DBSession.add(
            models.NumberLexeme(
                id=form["ID"],
                name=form["Form"],
                comment=form["Comment"],
                is_loan=form["Loan"],
                other_form=form["Other_Form"],
                org_form=org_form,
                is_problematic=form["Problematic"],
                valueset=vs,
            )
        )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip([r[0] for r in distinct_varieties], color.qualitative_colors(len(distinct_varieties)))
    )

    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))

    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
Esempio n. 29
0
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data',
                                           'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)
    for i, name in enumerate(
        ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(data_path('references', 'references.bib'),
                                  lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(
                common.Source_files(mime_type='application/pdf',
                                    object_pk=source.pk,
                                    jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(models.ConceptSet,
                 concept.ID,
                 id=concept.ID,
                 name=concept.GLOSS,
                 description=concept.DEFINITION,
                 semanticfield=concept.SEMANTICFIELD,
                 ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(
            models.Relation(source=data['ConceptSet'][rel.SOURCE],
                            target=data['ConceptSet'][rel.TARGET],
                            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(source=data['Source'][id_],
                                         contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(common.Contributor,
                                   name,
                                   id=slug(name),
                                   name=name)
            DBSession.add(
                common.ContributionContributor(ord=i,
                                               contribution=conceptlist,
                                               contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split(
        ):
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(
                common.Contribution_data(object_pk=conceptlist.pk,
                                         key=k,
                                         value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS',
                                    getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS',
                                    None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l)
                               for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key,
                                      value=value,
                                      object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(id=fname.stem,
                                           name=md['dc:title'],
                                           description=md['dc:description'],
                                           url=md['dc:source'],
                                           jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(metaprovider=provider,
                                                  conceptset=data['ConceptSet']
                                                  [meta['CONCEPTICON_ID']],
                                                  key=k,
                                                  value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Esempio n. 30
0
def main(args):
    data = Data()
    print(args.data_file('x'))

    dataset = common.Dataset(
        id=grammaticon.__name__,
        name="Grammaticon",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grammaticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, ed in enumerate(['Martin Haspelmath', 'Robert Forkel']):
        common.Editor(dataset=dataset, contributor=get_contributor(data, ed), ord=i + 1)

    eng = data.add(common.Language, 'eng', name='English')

    for obj in reader(args.data_file('Feature_lists.csv'), dicts=True):
        contrib = data.add(
            models.Featurelist, obj['id'],
            id=slug(obj['name']),
            name=obj['name'],
            year=obj['year'],
            number_of_features=int(obj['number of features']) if obj['number of features'] else None,
            url=obj['year'])
        if obj['authors']:
            for i, author in enumerate(obj['authors'].split(',')):
                common.ContributionContributor(
                    contribution=contrib,
                    contributor=get_contributor(data, author),
                    ord=i + 1)

    #id,name,feature_area
    for name, objs in itertools.groupby(
            sorted(reader(args.data_file('Metafeatures.csv'), dicts=True), key=lambda i: i['name']),
            lambda i: i['name']):
        dbobj = None
        for obj in objs:
            if not dbobj:
                dbobj = data.add(
                    models.Metafeature, obj['id'],
                    id=slug(obj['id']), name=obj['name'], area=obj['feature_area'])
            else:
                data['Metafeature'][obj['id']] = dbobj

    DBSession.flush()
    #feature_ID,feature name,feature description,meta_feature_id,collection_id,collection URL,collection numbers
    for obj in reader(args.data_file('Features.csv'), dicts=True):
        if int(obj['collection_id']) == 8:
            obj['collection_id'] = '1'
        if (not obj['meta_feature_id']):  #or obj['meta_feature_id'] in ('89'):
            print('skipping: {}'.format(obj))
            continue
        vsid = (data['Featurelist'][obj['collection_id']].pk, data['Metafeature'][obj['meta_feature_id']].pk)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id='{0}-{1}'.format(*vsid),
                contribution=data['Featurelist'][obj['collection_id']],
                parameter=data['Metafeature'][obj['meta_feature_id']],
                language=eng)
        models.Feature(
            valueset=vs, id=slug(obj['feature_ID']), name=obj['feature name'], description=obj['feature description'])

    for obj in reader(args.data_file('Concepts.csv'), dicts=True):
        data.add(
            models.Concept, obj['id'],
            id=obj.pop('id'), name=obj.pop('label'), description=obj.pop('definition'),
            **{k.replace(' ', '_'): v for k, v in obj.items()})

    for obj in reader(args.data_file('Concepts_metafeatures.csv'), dicts=True):
        if obj['meta_feature__id'] in ('89',):
            print('skipping: {}'.format(obj))
            continue
        if obj['concept_id'] and obj['meta_feature__id']:
            models.ConceptMetafeature(
                concept=data['Concept'][obj['concept_id']],
                metafeature=data['Metafeature'][obj['meta_feature__id']])

    for obj in reader(args.data_file('Concepthierarchy.csv'), dicts=True):
        child = data['Concept'].get(obj['concept_id'])
        if child:
            parent = data['Concept'].get(obj['concept_parent_id'])
            if parent:
                DBSession.add(models.ConceptRelation(parent=parent, child=child))