def get_clf_paths(lgs):
    """Yield one classification path per entry of ``lgs``.

    Every entry is handed to ``Glottolog.languoid``. The yielded value is a
    tuple of languoid ids that starts at the topmost ancestor (the node
    without a ``parent``) and ends with the languoid itself.
    """
    client = Glottolog()
    for identifier in lgs:
        node = client.languoid(identifier)
        # Climb the ``parent`` links, collecting ids bottom-up ...
        lineage = [node.id]
        while node.parent:
            lineage.append(node.parent.id)
            node = node.parent
        # ... then flip the list so that the root comes first.
        lineage.reverse()
        yield tuple(lineage)
def test_Glottolog(self):
    """Look up a languoid by ISO code and by URL and check its attributes.

    ``clldclient.database.Cache`` is patched with a ``MockCache`` for the
    duration of the test, and all lookups and attribute checks run while the
    patch is active.
    """
    from clldclient.glottolog import Glottolog

    with patch('clldclient.database.Cache', new=lambda: MockCache('glottolog_')):
        gl = Glottolog()
        # Lookup by ISO 639-3 code; only checks that the call succeeds.
        gl.languoid('deu')
        deu = gl.languoid('http://glottolog.org/resource/languoid/id/stan1295')
        # assertEqual/assertAlmostEqual replace the deprecated ``*Equals``
        # aliases, which no longer exist on Python >= 3.12.
        self.assertAlmostEqual(deu.longitude, 12.4676)
        # assertTrue instead of a bare ``assert`` so the check survives ``-O``.
        self.assertTrue(deu.latitude)
        self.assertEqual(deu.family.name, 'Indo-European')
        self.assertEqual(deu.parent.name, 'High Franconian')
        self.assertEqual(len(list(deu.children)), 3)
        self.assertEqual(deu.macroareas, ['Eurasia'])
def load_families(data,
                  languages,
                  glottolog=None,
                  icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Create Family objects and enrich Language objects from Glottolog.

    The family of a language is looked up in Glottolog via the language's
    ``id`` attribute, which must be either a glottocode or an ISO 639-3 code.
    An item of ``languages`` may also be a ``(code, language)`` pair, in which
    case ``code`` is used for the lookup instead of ``language.id``.

    :param data: registry providing ``data['Family']`` and ``data.add``.
    :param languages: iterable of language objects or ``(code, language)`` pairs.
    :param glottolog: Glottolog client; a new ``Glottolog()`` is created if falsy.
    :param icons: icons (objects with a ``name`` attribute, or plain names) \
    handed out in turn to newly created families.
    :param isolates_icon: icon name that is never handed out to a family.
    :return: None; the language objects are updated in place.
    """
    icon_names = [getattr(icon, 'name', icon) for icon in icons]
    icons = cycle([name for name in icon_names if name != isolates_icon])
    glottolog = glottolog or Glottolog()

    for item in languages:
        if isinstance(item, (tuple, list)) and len(item) == 2:
            code, language = item
        else:
            language = item
            code = item.id

        # A code of '-' means "no lookup"; the language is left untouched.
        if code == '-':
            continue

        gl_language = glottolog.languoid(code)
        if not gl_language:
            # Nothing found in Glottolog: no macroarea can be assigned.
            language.macroarea = None
            continue

        gl_family = gl_language.family
        if gl_family:
            family = data['Family'].get(gl_family.id)
            if not family:
                family_url = Identifier(
                    name=gl_family.id,
                    type=IdentifierType.glottolog.value).url()
                family = data.add(
                    Family,
                    gl_family.id,
                    id=gl_family.id,
                    name=gl_family.name,
                    description=family_url,
                    jsondata={'icon': next(icons)})
            language.family = family

        language.macroarea = gl_language.macroareas[0]
        add_language_codes(
            data, language, gl_language.iso_code, glottocode=gl_language.id)
        # Only fill in what the language does not already specify itself.
        for attr in ('latitude', 'longitude', 'name'):
            if getattr(language, attr) is None:
                setattr(language, attr, getattr(gl_language, attr))
def import_dataset(path, data, icons):
    """Import the values of one delimited data sheet.

    Next to the sheet itself, two optional companion files are read:
    ``<path>-metadata.json`` (contributor name under ``contributed_datapoint``
    and per-language entries under ``features``) and ``<basename>.bib`` in the
    same directory (sources).

    :param path: path of the sheet. A comma is used as delimiter when the \
    file extension contains the letter "c", a tab otherwise.
    :param data: registry used to look up and add Contributor, Source, \
    Feature, GrambankLanguage and ValueSet objects.
    :param icons: accepted for interface compatibility; not used here.
    :return: None
    """
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(
        ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    # Per-language overrides from the metadata file, keyed by glottocode.
    languages = {
        f['properties']['glottocode']: f for f in md.get('features', [])}

    rows = reader(
        path,
        dicts=True,
        quoting=csv.QUOTE_NONE,
        delimiter=',' if 'c' in ext else '\t')
    for i, row in enumerate(rows):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            if not languoid:
                # Without a languoid there is no name or location to store;
                # skip the row instead of failing on ``languoid.name``.
                print('skip value for unknown language %s' % row['Language_ID'])
                continue
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            # Name and coordinates from the metadata file take precedence.
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = \
                        lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage,
                row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet,
                vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        # Map the abbreviation found in the sheet to its domain element.
        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        # Link every known source whose key occurs in the source string.
        # An empty or missing source has nothing to link (and ``in`` would
        # raise on None).
        if vs.source:
            for key, src in data['Source'].items():
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
def import_dataset(path, data, icons, add_missing_features=False):
    """Import the values of one UTF-16 encoded, delimited data sheet.

    The file's basename is looked up in Glottolog to describe the
    contribution; if that fails a message is printed and nothing is imported.
    ``<path>-metadata.json`` and ``<basename>.bib`` are read when present.

    :param path: path of the sheet. A comma is used as separator when the \
    file extension contains the letter "c", a tab otherwise.
    :param data: registry used to look up and add Contributor, Source, \
    Feature, CulturebankLanguage, ValueSet and Value objects.
    :param icons: accepted for interface compatibility; not used here.
    :param add_missing_features: if true, a Feature is created for an unknown \
    ``Feature_ID``; otherwise such rows are skipped.
    :raises ValueError: if a value matches no element of its feature's domain.
    :return: None
    """
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(
            id=basename,
            name=basename,
            desc=glottolog.languoid(basename).name)
    except Exception:
        # Best effort: any failure to describe the contribution skips the
        # file, but KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(
        ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    # Per-language overrides from the metadata file, keyed by glottocode.
    languages = {
        f['properties']['glottocode']: f for f in md.get('features', [])}

    table = pandas.io.parsers.read_csv(
        path, sep=',' if 'c' in ext else '\t', encoding='utf-16')
    for i, row in table.iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(
                    Feature,
                    row['Feature_ID'],
                    id=row['Feature_ID'],
                    name=row.get('Feature', row['Feature_ID']))
            else:
                print(('skip value for invalid feature %s' % row['Feature_ID']))
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                languoid = None
            # An empty lookup result is treated like a failed lookup, so the
            # attribute accesses below cannot fail on a missing languoid.
            if not languoid:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue

            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            # Name and coordinates from the metadata file take precedence.
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = \
                        lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage,
                row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet,
                vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        # Resolve the sheet's value to a domain element: first as read, then
        # as a string (a numeric cell does not equal a string abbreviation).
        domain = {de.abbr: de for de in parameter.domain}
        domainelement = domain.get(row['Value'])
        if domainelement is None:
            domainelement = domain.get(str(row['Value']))
        if domainelement is None:
            raise ValueError(
                "For feature {} in language {}: Name {} not found among domain values {}".format(
                    row['Feature_ID'], row['Language_ID'], str(row['Value']), domain))

        data.add(
            Value,
            vid,
            id=vid,
            valueset=vs,
            name=domainelement.name,
            description=row['Comment'],
            # Pass the element that actually matched, also when the match
            # only succeeded after the string conversion.
            domainelement=domainelement)

        print(".", end="")
        # Link every known source whose key occurs in the source string.
        if vs.source is not None:
            for key, src in data['Source'].items():
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
def load_families(data,
                  languages,
                  glottolog=None,
                  icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language objects from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute
    of a language. This id must be either a glottocode or an ISO 639-3 code.
    An item of ``languages`` may also be a ``(code, language)`` pair.

    Only the families named in ``lsi_families`` below get a Family object;
    their icon is taken from ``custom_icons`` by family name.

    :param data: registry providing ``data['Family']`` and ``data.add``.
    :param languages: sized collection of language objects or \
    ``(code, language)`` pairs.
    :param glottolog: Glottolog client; a new ``Glottolog()`` is created if falsy.
    :param icons: kept for interface compatibility; not used by this variant.
    :param isolates_icon: kept for interface compatibility; not used by this variant.
    :return: None; the language objects are updated in place.
    """
    # the restriction to these families was added by shafqat
    lsi_families = (
        'Sino-Tibetan', 'Dravidian', 'Indo-European', 'Austroasiatic')
    glottolog = glottolog or Glottolog()

    # Progress output, written so that it is valid (and prints the same
    # text) on both Python 2 and Python 3.
    print('%s %s' % (len(languages), languages))
    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        print('%s %s' % (language, code))
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family and gl_family.name in lsi_families:
                    family = data['Family'].get(gl_family.id)
                    if not family:
                        # based on family, we can use different icons if we
                        # like, as needed in case of LSI
                        family = data.add(
                            Family,
                            gl_family.id,
                            id=gl_family.id,
                            name=gl_family.name,
                            description=Identifier(
                                name=gl_family.id,
                                type=IdentifierType.glottolog.value).url(),
                            jsondata=dict(icon=custom_icons[gl_family.name]))
                    language.family = family
                language.macroarea = gl_language.macroareas[0]
                add_language_codes(
                    data, language, gl_language.iso_code,
                    glottocode=gl_language.id)
                # Only fill in what the language does not already specify.
                for attr in ('latitude', 'longitude', 'name'):
                    if getattr(language, attr) is None:
                        setattr(language, attr, getattr(gl_language, attr))
            else:
                language.macroarea = None
def _lsi_family(data, glottolog, code):
    """Return the Family registered in ``data`` for ``code``, or None.

    None is returned when ``code`` is '-', when Glottolog has no languoid or
    no family for it, or when that family is not present in ``data['Family']``.
    """
    if code == '-':
        return None
    gl_language = glottolog.languoid(code)
    if not gl_language:
        return None
    gl_family = gl_language.family
    if not gl_family:
        return None
    return data['Family'].get(gl_family.id)


def main(args):
    """Wipe the LSI tables and re-import everything from the data sheets.

    Steps, in order: delete existing rows, read all sheets, add languages,
    families, feature domains, designers, features with their domain
    elements, and values; write the imported values to ``lsi.cldf``; finally
    add the Dataset with its editors.

    :param args: object providing ``data_file()`` (handed to ``_dtab``) and \
    ``log`` (a logger with ``info``, ``warn`` and ``error``).
    :return: None
    """
    data = Data(created=utc.localize(datetime(2013, 11, 15)),
                updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()  # not used further in this function

    # Same tables, same order as before.
    for table in (
            'Language', 'Unit', 'featuredomain', 'family', 'source',
            'parameter', 'feature', 'domainelement', 'valueset', 'value',
            'lsivalue', 'dataset', 'contributor', 'lsilanguage',
            'contribution', 'designer'):
        DBSession.execute("delete from %s" % table)
    DBSession.flush()

    dtab = partial(_dtab, args.data_file())

    # Languages
    # NOTE(review): the directory is hard-coded and ``[1:]`` drops whichever
    # entry ``os.listdir`` happens to return first (its order is arbitrary) --
    # confirm which entry is meant to be skipped.
    tabfns = os.listdir(
        '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data'
    )[1:]
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []
    lgs = {}
    nfeatures = Counter()
    nlgs = Counter()
    for fn in tabfns:
        for ld in dtab(fn):
            # to exclude languages which do not have an iso-code
            if ld['language_id'] in ('qgr', '---', ''):
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
                # Nothing can be imported for a row without a feature id;
                # skip it instead of failing with a KeyError just below.
                continue
            fid = ld["feature_alphanumid"]
            if not fid.startswith("DRS") and fid.find(".") == -1:
                ldps.append(dp_dict(ld))
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([fid])

    # groupby() below relies on this ordering.
    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    for lgid, lgname in lgs.items():
        data.add(
            models.lsiLanguage,
            lgid,
            id=lgid,
            name=lgname,
            representation=nfeatures.get(lgid, 0))
    DBSession.flush()

    load_families(
        data,
        [(NOCODE_TO_GLOTTOCODE.get(lang.id, lang.id), lang)
         for lang in data['lsiLanguage'].values()
         if lang.id != '---' and lang.id != ''],
        isolates_icon='tcccccc')

    # Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    # Designers
    # (formerly read from dtab("ntscontributions.tab") + dtab("ntscontacts.tab"))
    designers = [
        {'designer': 'shafqat', 'domain': '', 'pdflink': '', 'citation': ''},
        {'designer': '-', 'domain': '', 'pdflink': '', 'citation': ''},
    ]
    for i, info in enumerate(designers):
        designer_id = str(i + 1)
        data.add(
            models.Designer,
            info['designer'],
            id=designer_id,
            name=designer_id,
            domain=info["domain"],
            contributor=info['designer'],
            pdflink=info["pdflink"],
            citation=info["citation"])
    DBSession.flush()

    # Sources -- import currently disabled:
    # for k, (typ, bibdata) in [
    #         ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources')
    #         for bibsource in ld['bibsources'].split(",,,")]:
    #     if k not in data["Source"]:
    #         data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    # DBSession.flush()

    # Features
    fs = [(fid, mergeds(lds))
          for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    # fvdesc / fvdt / fvmis are not read again further down in this function.
    fvdesc = [
        (fid, [(ld.get("feature_possible_values"), ld.get("fromfile"))
               for ld in lds if ld.get("feature_possible_values")])
        for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]

    # Every feature name must belong to exactly one feature id.
    for _, dfsids in groupby(
            sorted((f.get('feature_name', fid), fid) for fid, f in fs),
            key=lambda t: t[0]):
        assert len(list(dfsids)) == 1

    for fid, f in fs:
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)
        feature = data.add(
            models.Feature,
            fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get(
                "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)",
                ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        # vdoc has the form "value==description||value==description||...".
        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        # Number the values by their sorted position.
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        for (v, desc) in vdesc.items():
            data.add(
                common.DomainElement,
                (fid, v),
                id='%s-%s' % (fid, v),
                name=v,
                description=desc,
                jsondata={"icon": Colors[v]},
                number=vi[v],
                parameter=feature)
    DBSession.flush()

    # Warn about (feature, language) pairs that carry conflicting values.
    for ((f, lg), ixs) in grp2(
            [((ld['feature_alphanumid'], ld['language_id']), i)
             for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn(
            "Dup value %s %s %s" %
            (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs]))

    # Values
    errors = {}
    done = set()
    glottolog = Glottolog()
    # language id -> Family (or None). The family is resolved once per
    # language, so a row never inherits the family of the previous row.
    families = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]

        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ in done:
            continue

        value_key = (ld['feature_alphanumid'], ld['value'])
        if value_key not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            info = (
                ld['feature_alphanumid'],
                ld.get('feature_name', "[Feature Name Lacking]"),
                ld['language_id'],
                ld['value'],
                ld['fromfile'])
            legal = sorted(
                y for (x, y) in data['DomainElement']
                if x == ld['feature_alphanumid'])
            # Formatted in one step: braces inside the data can no longer
            # break a second ``str.format`` pass.
            args.log.error(
                u"%s %s %s %s %s not in the set of legal values (%s)" %
                (info + (legal,)))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        # for printing different map markers for different families
        if language.id not in families:
            families[language.id] = _lsi_family(data, glottolog, language.id)
        family = families[language.id]

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            ##contribution=parameter.designer
        )
        domainelement = data['DomainElement'][value_key]
        models.lsiValue(
            id=id_,
            domainelement=domainelement,
            jsondata={
                "icon": domainelement.jsondata,
                # A language without a resolved family is looked up under
                # the key 'None' (``str(None)``).
                "family": FamilyCodes[str(family)],
            },
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)
        # References -- currently disabled:
        # if not ld.get('bibsources'):
        #     if 'bibsources' not in ld:
        #         args.log.warn("no bibsource %s" % ld)
        #     continue
        # for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
        #     common.ValueSetReference(valueset=vs, source=data['Source'][k])
    DBSession.flush()

    # To CLDF
    cldf = {}  # used as a set of rows; the values are irrelevant
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ not in done:
            continue
        dt = (lgs[ld['language_id']],
              ld['language_id'],
              ld['feature_alphanumid'] + ". " + ld['feature_name'],
              ld["value"])
        cldf[dt] = None

    # ``list(cldf)`` works on Python 2 and 3 (a keys view cannot be added
    # to a list on Python 3).
    rows = [("Language", "iso-639-3", "Feature", "Value")] + list(cldf)
    savu(u''.join([u'\t'.join(row) + u"\n" for row in rows]), "lsi.cldf")

    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="LSI",
        name='Linguistic Survey of India',
        publisher_name="Sprakbanken",
        publisher_place="Gothenburg",
        publisher_url="to be given",
        description="this is to be followed",
        domain='http://lsi.clld.org',
        # ``5`` rather than ``05``: a leading zero is a syntax error on
        # Python 3 and an octal literal on Python 2.
        published=date(2016, 5, 16),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
            'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'
        })

    # disabled for experimental purposes, names were appearing multiple times
    editors = [
        common.Contributor(id="Lars Borin",
                           name="Lars Borin",
                           email="*****@*****.**"),
        common.Contributor(id="Shafqat Mumtaz Virk",
                           name="Shafqat Mumtaz Virk",
                           email="*****@*****.**"),
        common.Contributor(id="Anju Saxena",
                           name="Anju Saxena",
                           email="*****@*****.**"),
        common.Contributor(id="Harald Hammarstrom",
                           name="Harald Hammarstrom",
                           email="*****@*****.**"),
    ]
    for i, contributor in enumerate(editors):
        common.Editor(dataset=dataset, contributor=contributor, ord=i)

    DBSession.add(dataset)
    DBSession.flush()
def import_dataset(path, provider):
    """Import one word list (a comma-separated sheet) as a ``Wordlist``.

    ``<path>-metadata.json`` must exist; its ``properties`` describe the
    contribution and its ``parameters`` map each ``Feature_ID`` to a concept
    name. All rows of the sheet are expected to share one ``Language_ID``.

    :param path: path of the comma-separated sheet.
    :param provider: prefix used when building the ids of the counterparts.
    :return: None
    """
    # look for metadata
    # look for sources
    # then loop over values
    basename = os.path.splitext(os.path.split(path)[1])[0]
    glottolog = Glottolog()

    mdpath = path + "-metadata.json"
    assert os.path.exists(mdpath)
    metadata = jsonload(mdpath)
    md = metadata["properties"]
    parameters = metadata["parameters"]

    # Display name, optionally suffixed with the id from the metadata.
    cname = md["name"]
    if "id" in md:
        cname = "%s [%s]" % (cname, md["id"])
    contrib = Wordlist(id=basename, name=cname)

    contributors = md.get("typedby", md.get("contributors"))
    if contributors:
        human = HumanName(contributors)
        contributor_id = slug(human.last + human.first)
        contributor = Contributor.get(contributor_id, default=None)
        if not contributor:
            contributor = Contributor(id=contributor_id, name="%s" % human)
        DBSession.add(
            ContributionContributor(contribution=contrib, contributor=contributor))

    # Reading sources from '<basename>.bib' is currently disabled:
    # bibpath = os.path.join(dirpath, basename + '.bib')
    # if os.path.exists(bibpath):
    #     for rec in Database.from_file(bibpath):
    #         if rec['key'] not in data['Source']:
    #             data.add(Source, rec['key'], _obj=bibtex2source(rec))

    data = Data()
    concepts = {concept.id: concept for concept in DBSession.query(Concept)}

    language = None
    for index, row in enumerate(reader(path, dicts=True, delimiter=",")):
        if not (row["Value"] and row["Feature_ID"]):
            continue

        # The concept id is the last path segment of the Feature_ID.
        fid = row["Feature_ID"].split("/")[-1]
        vsid = "%s-%s-%s" % (basename, row["Language_ID"], fid)
        vid = "%s-%s-%s" % (provider, basename, index + 1)

        if not language:
            language = Language.get(row["Language_ID"], default=None)
            if language is None:
                # query glottolog!
                languoid = glottolog.languoid(row["Language_ID"])
                language = LexibankLanguage(
                    id=row["Language_ID"],
                    name=languoid.name,
                    latitude=languoid.latitude,
                    longitude=languoid.longitude,
                )
        else:
            # Once resolved, every further row must name the same language.
            assert language.id == row["Language_ID"]

        parameter = concepts.get(fid)
        if parameter is None:
            parameter = Concept(
                id=fid,
                name=parameters[row["Feature_ID"]],
                concepticon_url=row["Feature_ID"],
            )
            concepts[fid] = parameter

        vs = data["ValueSet"].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet,
                vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row.get("Source"),
            )

        counterpart = Counterpart(
            id=vid,
            valueset=vs,
            name=row["Value"],
            description=row.get("Comment"),
            loan=row.get("Loan") == "yes",
        )

        if row.get("Cognate_Set"):
            # Only the first of possibly several comma-separated sets is used.
            csid = row["Cognate_Set"].split(",")[0].strip()
            cognateset = Cognateset.get(csid, key="name", default=None)
            if cognateset is None:
                cognateset = Cognateset(name=csid)
            counterpart.cognateset = cognateset

        # Linking sources is currently disabled:
        # for key, src in data['Source'].items():
        #     if key in vs.source:
        #         ValueSetReference(valueset=vs, source=src, key=key)

    contrib.language = language