Ejemplo n.º 1
0
def get_clf_paths(lgs):
    """Yield the chain of languoid ids leading to each language in ``lgs``.

    Each code in ``lgs`` is looked up with ``Glottolog().languoid`` and its
    ``parent`` links are followed until a languoid without a parent is reached.

    :param lgs: iterable of codes accepted by ``Glottolog.languoid``.
    :return: generator of tuples of languoid ids, ordered from the topmost
        ancestor down to the languoid that was looked up.
    """
    client = Glottolog()
    for code in lgs:
        node = client.languoid(code)
        # Collect the ids bottom-up and flip the list once at the end.
        lineage = [node.id]
        parent = node.parent
        while parent:
            lineage.append(parent.id)
            parent = parent.parent
        yield tuple(reversed(lineage))
Ejemplo n.º 2
0
    def test_Glottolog(self):
        """Check languoid lookup and attribute access of the Glottolog client.

        ``clldclient.database.Cache`` is replaced by a factory returning
        ``MockCache('glottolog_')`` for the duration of the test.
        """
        from clldclient.glottolog import Glottolog

        with patch('clldclient.database.Cache', new=lambda: MockCache('glottolog_')):
            gl = Glottolog()
            # The result is discarded: this only checks that a lookup by the
            # code 'deu' does not raise.
            gl.languoid('deu')
            deu = gl.languoid('http://glottolog.org/resource/languoid/id/stan1295')
            # Canonical assertion names are used; the ``assertEquals`` and
            # ``assertAlmostEquals`` aliases were removed in Python 3.12.
            self.assertAlmostEqual(deu.longitude, 12.4676)
            self.assertTrue(deu.latitude)
            self.assertEqual(deu.family.name, 'Indo-European')
            self.assertEqual(deu.parent.name, 'High Franconian')
            self.assertEqual(len(list(deu.children)), 3)
            self.assertEqual(deu.macroareas, ['Eurasia'])
Ejemplo n.º 3
0
def load_families(data,
                  languages,
                  glottolog=None,
                  icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language objects from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    For a language with a Glottolog match the family, the macroarea and the
    language codes are set, and ``latitude``, ``longitude`` and ``name`` are
    copied from Glottolog where they are still ``None``. A language without a
    match gets ``macroarea = None``. A language whose code is ``'-'`` is left
    untouched.

    :param data: object supporting ``data['Family'].get(key)`` and
        ``data.add(model, key, **kw)``.
    :param languages: iterable of language objects, or of ``(code, language)``
        pairs (tuples or lists of length 2) to look a language up by a code
        other than ``language.id``.
    :param glottolog: object with a ``languoid(code)`` method; a ``Glottolog``
        instance is created when this is omitted or false.
    :param icons: iterable of icons, given as objects with a ``name`` attribute
        or as plain names; they are handed out in rotation to new families.
    :param isolates_icon: icon name that is left out of the rotation.
    :return: None; ``data`` and the language objects are modified in place.
    """
    icon_names = (getattr(icon, 'name', icon) for icon in icons)
    icons = cycle([name for name in icon_names if name != isolates_icon])
    glottolog = glottolog or Glottolog()

    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        if code == '-':
            continue

        gl_language = glottolog.languoid(code)
        if not gl_language:
            language.macroarea = None
            continue

        gl_family = gl_language.family
        if gl_family:
            family = data['Family'].get(gl_family.id)
            if not family:
                family = data.add(
                    Family,
                    gl_family.id,
                    id=gl_family.id,
                    name=gl_family.name,
                    description=Identifier(
                        name=gl_family.id,
                        type=IdentifierType.glottolog.value).url(),
                    jsondata=dict(icon=next(icons)))
            language.family = family

        # An empty macroarea list must not abort the whole import; fall back
        # to None, the same value used for languages without a Glottolog match.
        macroareas = gl_language.macroareas
        language.macroarea = macroareas[0] if macroareas else None
        add_language_codes(data,
                           language,
                           gl_language.iso_code,
                           glottocode=gl_language.id)
        # Only fill the gaps; values already present on the language win.
        for attr in 'latitude', 'longitude', 'name':
            if getattr(language, attr) is None:
                setattr(language, attr, getattr(gl_language, attr))
Ejemplo n.º 4
0
def import_dataset(path, data, icons):
    """Import the values of one data sheet, together with metadata and sources.

    Besides the delimited file at ``path`` two optional companion files are
    read: ``<path>-metadata.json`` (key ``contributed_datapoint`` for the
    contributor name, key ``features`` for per-language names and coordinates)
    and ``<basename>.bib`` in the same directory (sources).

    Rows are skipped when ``Value`` or ``Feature_ID`` is empty, when the
    feature is not in ``data['Feature']``, or when the language is new and
    ``glottolog.languoid`` returns a false value for its id.

    :param path: path of the data sheet. A ``'c'`` in the file extension
        selects ``','`` as delimiter, anything else a tab.
    :param data: object supporting ``data[name].get(key)``,
        ``key in data[name]`` and ``data.add(model, key, **kw)``.
    :param icons: not used by this function.
    :return: None
    """
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    # Metadata is optional; without it the default contributor is used.
    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    # Sources are optional as well; a key that is already known is not re-added.
    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    # Per-language overrides from the metadata, keyed by glottocode.
    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    rows = reader(
        path,
        dicts=True,
        quoting=csv.QUOTE_NONE,
        delimiter=',' if 'c' in ext else '\t')
    for i, row in enumerate(rows):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            if not languoid:
                # Handled like an invalid feature: report and go on, instead of
                # failing with an AttributeError on the missing languoid.
                print('skip value for unknown language %s' % row['Language_ID'])
                continue
            name = languoid.name
            longitude, latitude = languoid.longitude, languoid.latitude

            # Name and coordinates from the metadata take precedence over the
            # Glottolog values; ``or {}`` also copes with explicit nulls.
            lmd = languages.get(row['Language_ID'])
            if lmd:
                name = (lmd.get('properties') or {}).get('name') or name
                coordinates = (lmd.get('geometry') or {}).get('coordinates')
                if coordinates:
                    longitude, latitude = coordinates

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=name,
                latitude=latitude,
                longitude=longitude)

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        # Map the abbreviation used in the sheet to the full domain element.
        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        # Link every known source whose key occurs in the source field.
        # NOTE(review): for a string field this is a substring test, so a key
        # contained in a longer key matches as well - confirm this is intended.
        if vs.source:
            for key, src in data['Source'].items():
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
Ejemplo n.º 5
0
def import_dataset(path, data, icons, add_missing_features = False):
    """Import the values of one UTF-16 encoded data sheet.

    The base name of the file (without extension) is looked up in Glottolog;
    if that fails a message is printed and nothing is imported. Besides the
    sheet itself two optional companion files are read:
    ``<path>-metadata.json`` (key ``contributed_datapoint`` for the contributor
    name, key ``features`` for per-language names and coordinates) and
    ``<basename>.bib`` in the same directory (sources).

    :param path: path of the data sheet. A ``'c'`` in the file extension
        selects ``','`` as separator, anything else a tab.
    :param data: object supporting ``data[name].get(key)``,
        ``key in data[name]`` and ``data.add(model, key, **kw)``.
    :param icons: not used by this function.
    :param add_missing_features: if true, a feature that is not yet in
        ``data['Feature']`` is added; otherwise rows for it are skipped.
    :return: None
    :raises ValueError: if a value matches no abbreviation in the domain of its
        feature, neither as read nor converted with ``str``.
    """
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(
            id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except Exception:
        # Deliberately broad, but no longer a bare ``except``, so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    # Metadata is optional; without it the default contributor is used.
    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    # Sources are optional as well; a key that is already known is not re-added.
    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    # Per-language overrides from the metadata, keyed by glottocode.
    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path,
            sep=',' if 'c' in ext else '\t',
            encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if not add_missing_features:
                print('skip value for invalid feature %s' % row['Feature_ID'])
                continue
            parameter = data.add(
                Feature,
                row['Feature_ID'],
                id=row['Feature_ID'],
                name=row.get('Feature', row['Feature_ID']))

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                languoid = None
            if not languoid:
                # Covers a failing lookup as well as a lookup without result;
                # the latter used to fail on ``languoid.name`` below.
                print('Skipping, no Glottocode found for %s' % row['Language_ID'])
                continue

            name = languoid.name
            longitude, latitude = languoid.longitude, languoid.latitude

            # Name and coordinates from the metadata take precedence over the
            # Glottolog values; ``or {}`` also copes with explicit nulls.
            lmd = languages.get(row['Language_ID'])
            if lmd:
                name = (lmd.get('properties') or {}).get('name') or name
                coordinates = (lmd.get('geometry') or {}).get('coordinates')
                if coordinates:
                    longitude, latitude = coordinates

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=name,
                latitude=latitude,
                longitude=longitude)

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            # pandas reports an empty cell as NaN; store None instead, because
            # the ``key in vs.source`` test below fails on a float.
            source = row['Source']
            if pandas.isnull(source):
                source = None
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=source)

        # Map the abbreviation used in the sheet to the full domain element.
        # pandas may have parsed the value as a number, so the string form is
        # tried as well, and the key that matched is used for the name AND the
        # domain element.
        domain = {de.abbr: de for de in parameter.domain}
        value_key = row['Value']
        if value_key not in domain:
            value_key = str(value_key)
            if value_key not in domain:
                raise ValueError(
                    "For feature {} in language {}: Name {} not found among domain values {}".format(
                        row['Feature_ID'],
                        row['Language_ID'],
                        value_key,
                        domain))
        domainelement = domain[value_key]

        # NOTE(review): row['Comment'] may be NaN for an empty cell - confirm
        # how that should be stored.
        data.add(Value,
            vid,
            id=vid,
            valueset=vs,
            name=domainelement.name,
            description=row['Comment'],
            domainelement=domainelement)

        print(".", end="")
        # Link every known source whose key occurs in the source field.
        if vs.source:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
Ejemplo n.º 6
0
def load_families(data,
                  languages,
                  glottolog=None,
                  icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language objects from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    Only the families named 'Sino-Tibetan', 'Dravidian', 'Indo-European' and
    'Austroasiatic' are added; their icon is read from ``custom_icons``.
    For a language with a Glottolog match the macroarea and the language codes
    are set, and ``latitude``, ``longitude`` and ``name`` are copied from
    Glottolog where they are still ``None``. A language without a match gets
    ``macroarea = None``. A language whose code is ``'-'`` is left untouched.

    :param data: object supporting ``data['Family'].get(key)`` and
        ``data.add(model, key, **kw)``.
    :param languages: sized iterable of language objects, or of
        ``(code, language)`` pairs (tuples or lists of length 2) to look a
        language up by a code other than ``language.id``.
    :param glottolog: object with a ``languoid(code)`` method; a ``Glottolog``
        instance is created when this is omitted or false.
    :param icons: kept for signature compatibility; not used, because icons
        are taken from ``custom_icons``.
    :param isolates_icon: kept for signature compatibility; not used.
    :return: None; ``data`` and the language objects are modified in place.
    """
    glottolog = glottolog or Glottolog()
    # Debug output; a single pre-formatted string prints the same text with the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s' % (len(languages), languages))

    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        print('%s %s' % (language, code))
        if code == '-':
            continue

        gl_language = glottolog.languoid(code)
        if not gl_language:
            language.macroarea = None
            continue

        gl_family = gl_language.family
        # the second condition is added by me (shafqat)
        if gl_family and gl_family.name in (
                'Sino-Tibetan', 'Dravidian', 'Indo-European', 'Austroasiatic'):
            family = data['Family'].get(gl_family.id)
            if not family:
                family = data.add(
                    Family,
                    gl_family.id,
                    id=gl_family.id,
                    name=gl_family.name,
                    description=Identifier(
                        name=gl_family.id,
                        type=IdentifierType.glottolog.value).url(),
                    # based on family, we can use different icons if we like
                    # as needed in case of LSI
                    jsondata=dict(icon=custom_icons[gl_family.name]))
            language.family = family

        # An empty macroarea list must not abort the whole import; fall back
        # to None, the same value used for languages without a Glottolog match.
        macroareas = gl_language.macroareas
        language.macroarea = macroareas[0] if macroareas else None
        add_language_codes(data,
                           language,
                           gl_language.iso_code,
                           glottocode=gl_language.id)
        # Only fill the gaps; values already present on the language win.
        for attr in 'latitude', 'longitude', 'name':
            if getattr(language, attr) is None:
                setattr(language, attr, getattr(gl_language, attr))
Ejemplo n.º 7
0
def main(args):
    """Rebuild the LSI database content from the tab-separated data sheets.

    The steps are, in order: delete the rows of the existing tables, read the
    data sheets, add languages and their families, feature domains, designers,
    features with their domain elements and finally the values; then write the
    ``lsi.cldf`` export and add the dataset record with its editors.

    :param args: script arguments; ``args.data_file()`` and the logger
        ``args.log`` are used.
    :return: None
    """
    data = Data(created=utc.localize(datetime(2013, 11, 15)),
                updated=utc.localize(datetime(2013, 12, 12)))
    # NOTE(review): only referenced by disabled code further down.
    icons = issues.Icons()

    # Start from empty tables; the order of the statements is unchanged.
    for table in (
            'Language', 'Unit', 'featuredomain', 'family', 'source',
            'parameter', 'feature', 'domainelement', 'valueset', 'value',
            'lsivalue', 'dataset', 'contributor', 'lsilanguage',
            'contribution', 'designer'):
        DBSession.execute("delete from %s" % table)

    DBSession.flush()

    dtab = partial(_dtab, args.data_file())

    #Languages

    # previously:
    #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')]
    # NOTE(review): the sheet directory is a hard-coded absolute path, and the
    # first entry returned by os.listdir() is dropped - presumably a hidden
    # file. os.listdir() returns entries in arbitrary order, so verify this.
    tabfns = os.listdir(
        '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data'
    )[1:]
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []  # datapoints, one dict per accepted sheet row
    lgs = {}  # language id -> language name
    nfeatures = Counter()  # language id -> number of values other than "?"
    nlgs = Counter()  # feature id -> number of values other than "?"

    for fn in tabfns:
        for ld in dtab(fn):
            # to exclude languages which do not have an iso-code
            if ld['language_id'] in ('qgr', '---', ''):
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
                # The row cannot be processed without a feature id; it used to
                # fail with a KeyError right after being logged.
                continue
            feature_id = ld["feature_alphanumid"]
            if feature_id.startswith("DRS") or "." in feature_id:
                continue
            ldps.append(dp_dict(ld))
            lgs[ld['language_id']] = unescape(ld['language_name'])
            if ld["value"] != "?":
                nfeatures.update([ld['language_id']])
                nlgs.update([feature_id])

    # groupby() below relies on the datapoints being sorted by feature id.
    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    for lgid, lgname in lgs.items():
        data.add(models.lsiLanguage,
                 lgid,
                 id=lgid,
                 name=lgname,
                 representation=nfeatures.get(lgid, 0))
    DBSession.flush()

    load_families(
        data,
        [(NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id), lg)
         for lg in data['lsiLanguage'].values()
         if lg.id != '---' and lg.id != ''],
        isolates_icon='tcccccc')

    #Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    #Designers
    #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")):
    for i, info in enumerate([{
            'designer': 'shafqat',
            'domain': '',
            'pdflink': '',
            'citation': ''
    }, {
            'designer': '-',
            'domain': '',
            'pdflink': '',
            'citation': ''
    }]):
        designer_id = str(i + 1)
        data.add(models.Designer,
                 info['designer'],
                 id=designer_id,
                 name=designer_id,
                 domain=info["domain"],
                 contributor=info['designer'],
                 pdflink=info["pdflink"],
                 citation=info["citation"])
    DBSession.flush()

    #Sources
    # The import of bibliographic sources is disabled; the code is kept in a
    # string literal, which is a statement without effect.
    '''for k, (typ, bibdata) in [
        ktfbib(bibsource) for ld in ldps
        if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,")
    ]:
        if k not in data["Source"]:
            data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    DBSession.flush()'''

    #Features
    fs = [(fid, mergeds(lds))
          for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    # NOTE(review): fvdesc, fvdt and fvmis are computed but not used afterwards.
    fvdesc = [(fid, [
        (ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds
        if ld.get("feature_possible_values")
    ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]

    # Every feature name must belong to exactly one feature id.
    for _, dfsids in groupby(sorted(
        (f.get('feature_name', fid), fid) for fid, f in fs),
                             key=lambda t: t[0]):
        assert len(list(dfsids)) == 1

    for fid, f in fs:
        if not fid.isdigit():
            # NOTE(review): such an id is only logged here; int(fid) below
            # raises ValueError for it.
            args.log.info("NO INT FID %s" % f)
        feature = data.add(
            models.Feature,
            fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get(
                "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)",
                ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        # vdoc is a list of "value==description" entries separated by "||".
        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}

        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        # Number the values by their position in sorted order.
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        ##vicons = icons.iconize(vi.keys())
        for (v, desc) in vdesc.items():
            data.add(common.DomainElement, (fid, v),
                     id='%s-%s' % (fid, v),
                     name=v,
                     description=desc,
                     jsondata={"icon": Colors[v]},
                     number=vi[v],
                     parameter=feature)
    DBSession.flush()

    # Report feature/language pairs for which the sheets disagree on the value.
    for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']),
                                 i) for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        # Logger.warning() replaces the deprecated alias Logger.warn().
        args.log.warning("Dup value %s %s %s" %
                         (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'])
                                  for ix in ixs]))
    errors = {}
    done = set()  # ids of the value sets created so far
    glottolog = Glottolog()
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]

        # Only the first datapoint of a feature/language pair is imported.
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ in done:
            continue

        de_key = (ld['feature_alphanumid'], ld['value'])
        if de_key not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            info = (ld['feature_alphanumid'],
                    ld.get('feature_name', "[Feature Name Lacking]"),
                    ld['language_id'], ld['value'], ld['fromfile'])
            legal_values = sorted(
                y for (x, y) in data['DomainElement'].keys()
                if x == ld['feature_alphanumid'])
            # One formatting step only, so that braces in the data cannot
            # break the message.
            args.log.error(
                u"%s %s %s %s %s not in the set of legal values (%s)" %
                (info + (legal_values, )))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        # for printing different map markers for different familys for
        # features:shafqat
        # The family is reset for every datapoint, so that a language without
        # a Glottolog family does not get the family of the previous language
        # (or fails with a NameError for the very first datapoint).
        # NOTE(review): the lookup uses language.id, whereas load_families is
        # called with ids mapped through NOCODE_TO_GLOTTOCODE - confirm.
        family = None
        if language.id != '-':
            gl_language = glottolog.languoid(language.id)
            gl_family = gl_language.family if gl_language else None
            if gl_family:
                family = data['Family'].get(gl_family.id)

        domainelement = data['DomainElement'][de_key]
        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            ##contribution=parameter.designer
        )
        models.lsiValue(
            id=id_,
            domainelement=domainelement,
            jsondata={
                "icon": domainelement.jsondata,
                # NOTE(review): for a language without family this looks up
                # the key 'None' - confirm that FamilyCodes provides it.
                "family": FamilyCodes[str(family)]
            },
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)
        '''if not ld.get('bibsources'):
            if 'bibsources' not in ld:
                args.log.warn("no bibsource %s" % ld)
            continue
        for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
            common.ValueSetReference(valueset=vs, source=data['Source'][k])'''
    DBSession.flush()

    #To CLDF
    # A dict is used to drop duplicate rows.
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ not in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'],
              ld['feature_alphanumid'] + ". " + ld['feature_name'],
              ld["value"])
        cldf[dt] = None

    # list(cldf) instead of cldf.keys(): a dict view cannot be appended to a
    # list on Python 3.
    rows = [("Language", "iso-639-3", "Feature", "Value")] + list(cldf)
    savu(u''.join([u'\t'.join(row) + u"\n" for row in rows]), "lsi.cldf")

    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="LSI",
        name='Linguistic Survey of India',
        publisher_name="Sprakbanken",
        publisher_place="Gothenburg",
        publisher_url="to be given",
        description="this is to be followed",
        domain='http://lsi.clld.org',
        # Written as 5, not 05: a leading zero is a syntax error on Python 3.
        published=date(2016, 5, 16),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon':
            'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
            'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'
        })

    # NOTE(review): an earlier comment at this place read "disabled for
    # experimental purposes, names were appearing multiple times", but this
    # loop is active; the variant in the string literal below is the disabled
    # one.
    for i, contributor in enumerate([
            common.Contributor(id="Lars Borin",
                               name="Lars Borin",
                               email="*****@*****.**"),
            common.Contributor(id="Shafqat Mumtaz Virk",
                               name="Shafqat Mumtaz Virk",
                               email="*****@*****.**"),
            common.Contributor(id="Anju Saxena",
                               name="Anju Saxena",
                               email="*****@*****.**"),
            common.Contributor(id="Harald Hammarstrom",
                               name="Harald Hammarstrom",
                               email="*****@*****.**")
    ]):
        common.Editor(dataset=dataset, contributor=contributor, ord=i)
    '''cont1 = common.Contributor(
            id="Harald Hammarstrom",
            name="Harald Hammarstrom",
            email="*****@*****.**")
    cont2= common.Contributor(
            id="Shafqat Mumtaz Virk",
            name="Shafqat Mumtaz Virk",
            email="*****@*****.**")
    cont3 = common.Contributor(
            id="Lars Borin",
            name="Lars Borin",
            email="*****@*****.**")
    for contributor in [cont1,cont2,cont3]:
        common.Editor(dataset=dataset, contributor=contributor,ord=1)'''

    DBSession.add(dataset)
    DBSession.flush()
Ejemplo n.º 8
0
def import_dataset(path, provider):
    """Load one word list file together with its ``-metadata.json`` companion.

    The metadata file ``<path>-metadata.json`` must exist; its ``properties``
    describe the contribution and its ``parameters`` map the ``Feature_ID``
    values of the rows to concept names. All rows are expected to share one
    ``Language_ID``; that language is attached to the contribution at the end.

    :param path: path of the comma-separated data file.
    :param provider: prefix used for the ids of the created counterparts.
    :return: None
    """
    # The file name without directory and extension identifies the word list.
    basename = os.path.splitext(os.path.basename(path))[0]
    glottolog = Glottolog()

    mdpath = path + "-metadata.json"
    assert os.path.exists(mdpath)
    metadata = jsonload(mdpath)
    md = metadata["properties"]
    parameters = metadata["parameters"]

    # The display name carries the id in brackets when the metadata has one.
    if "id" in md:
        cname = "%s [%s]" % (md["name"], md["id"])
    else:
        cname = md["name"]
    contrib = Wordlist(id=basename, name=cname)

    # "typedby" is preferred over "contributors".
    contributors = md.get("typedby", md.get("contributors"))
    if contributors:
        human = HumanName(contributors)
        contributor_id = slug(human.last + human.first)
        contributor = Contributor.get(contributor_id, default=None) or Contributor(
            id=contributor_id, name="%s" % human)

        DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    # Reading a companion .bib file is currently not done here.

    data = Data()
    # Concepts already in the database, extended with the ones created below.
    concepts = dict((concept.id, concept) for concept in DBSession.query(Concept))
    language = None

    for number, row in enumerate(reader(path, dicts=True, delimiter=","), 1):
        if not (row["Value"] and row["Feature_ID"]):
            continue

        # The concept id is the last path segment of the feature reference.
        fid = row["Feature_ID"].rsplit("/", 1)[-1]
        language_id = row["Language_ID"]
        vsid = "%s-%s-%s" % (basename, language_id, fid)
        vid = "%s-%s-%s" % (provider, basename, number)

        if not language:
            # First usable row: fetch the language, or create it from Glottolog.
            language = Language.get(language_id, default=None)
            if language is None:
                # query glottolog!
                languoid = glottolog.languoid(language_id)
                language = LexibankLanguage(
                    id=language_id,
                    name=languoid.name,
                    latitude=languoid.latitude,
                    longitude=languoid.longitude,
                )
        else:
            assert language.id == language_id

        if concepts.get(fid) is None:
            concepts[fid] = Concept(
                id=fid,
                name=parameters[row["Feature_ID"]],
                concepticon_url=row["Feature_ID"],
            )
        parameter = concepts[fid]

        vs = data["ValueSet"].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet,
                vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row.get("Source"),
            )

        counterpart = Counterpart(
            id=vid,
            valueset=vs,
            name=row["Value"],
            description=row.get("Comment"),
            loan=row.get("Loan") == "yes",
        )

        cognate_sets = row.get("Cognate_Set")
        if cognate_sets:
            # Only the first of several comma-separated cognate sets is used.
            csid = cognate_sets.partition(",")[0].strip()
            cs = Cognateset.get(csid, key="name", default=None)
            if cs is None:
                cs = Cognateset(name=csid)
            counterpart.cognateset = cs

        # References from value sets to sources are currently not created.

    contrib.language = language