def bibtex2source(rec, cls=common.Source): year = bibtex.unescape(rec.get("year", "nd")) fields = {} jsondata = {} for field in bibtex.FIELDS: if field in rec: value = bibtex.unescape(rec[field]) container = fields if hasattr(cls, field) else jsondata container[field] = value etal = "" eds = "" authors = rec.get("author") if not authors: authors = rec.get("editor", "") if authors: eds = " (eds.)" if authors: authors = bibtex.unescape(authors).split(" and ") if len(authors) > 2: authors = authors[:1] etal = " et al." authors = [HumanName(a) for a in authors] authors = [n.last or n.first for n in authors] authors = "%s%s%s" % (" and ".join(authors), etal, eds) return cls( id=slug(rec.id), name=("%s %s" % (authors, year)).strip(), description=bibtex.unescape(rec.get("title", rec.get("booktitle", ""))), jsondata=jsondata, bibtex_type=rec.genre, **fields )
def bibtex2source(rec, cls=common.Source):
    """Create a ``cls`` instance (default ``common.Source``) from a bibtex record.

    Known bibtex fields matching attributes of ``cls`` become constructor
    keywords; the rest is collected in ``jsondata``.
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field not in rec:
            continue
        value = bibtex.unescape(rec[field])
        if hasattr(cls, field):
            fields[field] = value
        else:
            jsondata[field] = value

    etal = eds = ''
    creators = rec.get('author')
    if not creators:
        # No authors: use editors (if any) and flag them in the name.
        creators = rec.get('editor', '')
        eds = ' (eds.)' if creators else ''
    if creators:
        parts = bibtex.unescape(creators).split(' and ')
        if len(parts) > 2:
            # Compress long creator lists to the first name plus "et al.".
            parts, etal = parts[:1], ' et al.'
        last_names = [hn.last or hn.first for hn in [HumanName(p) for p in parts]]
        creators = '%s%s%s' % (' and '.join(last_names), etal, eds)

    return cls(
        id=slug(rec.id),
        name=('%s %s' % (creators, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
def bibtex2source(rec, cls=common.Source, lowercase_id=False):
    """Instantiate ``cls`` (default ``common.Source``) from a bibtex record.

    :param rec: a bibtex record (mapping-like, with ``id`` and ``genre``).
    :param cls: model class; bibtex fields matching its attributes are passed
        as keyword arguments, the remainder goes into ``jsondata``.
    :param lowercase_id: if ``True``, lowercase the slugified record id.
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    direct, extra = {}, {}
    for name in bibtex.FIELDS:
        if name in rec:
            (direct if hasattr(cls, name) else extra)[name] = \
                bibtex.unescape(rec[name])

    et_al = ''
    editors_suffix = ''
    people = rec.get('author')
    if not people:
        people = rec.get('editor', '')
        if people:
            editors_suffix = ' (eds.)'
    if people:
        split_people = bibtex.unescape(people).split(' and ')
        if len(split_people) > 2:
            # Only the first creator is kept; the rest becomes "et al.".
            split_people = split_people[:1]
            et_al = ' et al.'
        parsed = [HumanName(sp) for sp in split_people]
        surnames = [hn.last or hn.first for hn in parsed]
        people = '%s%s%s' % (' and '.join(surnames), et_al, editors_suffix)

    return cls(
        id=slug(rec.id, lowercase=lowercase_id),
        name=('%s %s' % (people, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=extra,
        bibtex_type=rec.genre,
        **direct)
def test_unescape(self):
    """Check LaTeX and ?[\\uNNNN] unescaping in clld.lib.bibtex."""
    from clld.lib.bibtex import unescape, u_unescape

    latin1_input = binary_type("\\ss \xef".encode('latin1'))
    self.assertEqual(unescape(latin1_input), 'ß\xef')
    self.assertEqual(unescape("\\ss "), 'ß')
    self.assertEqual(u_unescape('?[\\u123] ?[\\u1234]'), '{ \u04d2')
    en_dash = '\u2013'
    # Already-unescaped unicode text must pass through unchanged.
    self.assertEqual(en_dash, unescape(en_dash))
    # ?[\u65533] decodes to U+FFFD, the replacement character.
    self.assertEqual(unescape('?[\\u65533]'), '\ufffd')
def bibtex2source(rec):  # pragma: no cover
    """Convert a bibtex record into a ``common.Source`` instance.

    Fields present as attributes on ``common.Source`` become constructor
    keywords, everything else goes into ``jsondata``.  The display name is
    built from (abbreviated) author/editor names and the year, and thesis
    genres are mapped to proper bibtex entry types.
    """
    year = rec.get('year', 'nd')
    fields = {}
    jsondata = {}
    for field in FIELDS:
        if field in rec:
            value = unescape(rec[field])
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value
            # remove \\ from url fields!
            if field == 'url':
                container[field] = container[field].replace('\\', '')
    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = unescape(authors).split(' and ')
        if len(authors) > 2:
            # Abbreviate long author lists to "<first author> et al.".
            authors = authors[:1]
            etal = ' et al.'
        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)
    if rec.genre == 'thesis':
        # BUGFIX: use .get() — a thesis record without a 'type' field used to
        # raise KeyError; such records now default to 'mastersthesis'.
        if rec.get('type') == 'phdthesis':
            rec.genre = 'phdthesis'
        else:
            rec.genre = 'mastersthesis'
    try:
        bibtex_type = EntryType.from_string(rec.genre)
    except Exception:
        # BUGFIX: was a bare ``except:`` which also swallowed SystemExit and
        # KeyboardInterrupt.  Unknown genres fall back to 'misc'.
        bibtex_type = EntryType.from_string('misc')
    return common.Source(
        id=rec.id,
        name=('%s %s' % (authors, year)).strip(),
        description=unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=bibtex_type,
        **fields)
def bibtex2source(rec):
    """Turn a bibtex record into a ``common.Source``.

    Unlike the other variants, the author/editor string is used verbatim
    (no surname extraction or "et al." abbreviation).
    """
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields, jsondata = {}, {}
    for key in bibtex.FIELDS:
        if key not in rec:
            continue
        decoded = bibtex.unescape(rec[key])
        target = fields if hasattr(common.Source, key) else jsondata
        target[key] = decoded

    creator = bibtex.unescape(rec.get('author', rec.get('editor', '')))
    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (creator, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)
def unescape_dict(d):
    """Recursively unescape the string values of ``d`` in place and return it."""
    # NOTE(review): Python 2 only — ``str.decode('utf8')`` on literals and
    # ``StringTypes`` do not exist on Python 3; port with care.
    # Re-key two known LaTeX-escaped country names to their unicode form.
    for o, n in [(r'Cura\c{c}ao', "Curaçao"), ("Saint Barth\\'elemy", "Saint-Barthélemy")]:
        if o in d:
            d[n.decode('utf8')] = d[o]
            del d[o]
    # Recurse into nested dicts; unescape plain string values, additionally
    # fixing one escape sequence that ``unescape`` itself does not handle.
    for k, v in d.items():
        if isinstance(v, dict):
            d[k] = unescape_dict(v)
        elif isinstance(v, StringTypes):
            d[k] = unescape(v).replace("Barth\\'elemy", "Barthélemy".decode('utf8'))
    return d
def convert(key, value):
    """Convert a raw field ``value`` to its typed form, dispatching on ``key``."""
    #if key == 'code+name':
    #    return value.replace('\\', "\\\\")
    if key == 'coordinates':
        # SECURITY NOTE(review): ``eval`` on file-derived data — acceptable
        # only for a trusted source; consider ``ast.literal_eval`` instead.
        lon, lat = eval(value)
        return dict(latitude=lat, longitude=lon)
    if key == 'population_numeric':
        return int(value)
    # Fields holding comma-separated lists.
    if key in [
        'alternate_names',
        'classification',
        'classification-hh',
        'country',
        'dialects',
    ]:
        return split(unescape(value))
    if key == 'typology':
        return split(unescape(value), sep=';')
    if key == 'writing':
        return split(unescape(value), sep='.')
    if key == 'also_spoken_in':
        # SECURITY NOTE(review): same ``eval`` concern as for 'coordinates'.
        return unescape_dict(eval(value))
    # Default: plain unescaped string.
    return unescape(value)
def main(args): """ The case is we have to codings for two different dialects (called hua and yagaria) of the same iso "qgr", both of which we want to keep and keep separately. I had missed that when making NTS, rigging everything so that the iso would be the id, which is not sufficient. Glottocodes in Grambank would have taken care of it except the dialect division for yaga1260 is wrong, having yagaria as overarching and Hua under it (reality has it that Hua and Yagaria are two dialects of the same language, which has no name). So a solution with glottocodes would have to wait until we fix that or need another fix later. So I guess, for now, let's ignore qgr (and its datapoints) and I'll fix on my end later. """ data = Data( created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() dtab = partial(_dtab, args.data_file()) #Languages tabfns = ['%s' % fn.name for fn in args.data_file().glob('nts_*.tab')] args.log.info("Sheets found: %s" % tabfns) ldps = [] lgs = {} nfeatures = Counter() nlgs = Counter() for fn in tabfns: for ld in dtab(fn): if ld['language_id'] == 'qgr': continue if "feature_alphanumid" not in ld: args.log.info("NO FEATUREID %s %s" % (len(ld), ld)) if not ld["feature_alphanumid"].startswith("DRS") \ and ld["feature_alphanumid"].find(".") == -1: ldps.append(dp_dict(ld)) lgs[ld['language_id']] = unescape(ld['language_name']) if ld["value"] != "?": nfeatures.update([ld['language_id']]) nlgs.update([ld['feature_alphanumid']]) ldps = sorted(ldps, key=lambda d: d['feature_alphanumid']) lgs["ygr"] = "Hua" for lgid, lgname in lgs.items(): data.add( models.ntsLanguage, lgid, id=lgid, name=lgname, representation=nfeatures.get(lgid, 0)) DBSession.flush() load_families(data, [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['ntsLanguage'].values()], isolates_icon='tcccccc') #glottolog = Glottolog() #for lg in data['ntsLanguage'].values(): # print lg.id, NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id) # 
gl_language = glottolog.languoid(NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id)) # if not gl_language.family: # family = data.add(Family, gl_language.id, id = gl_language.id, name = gl_language.name, description=common.Identifier(name=gl_language.id, type=common.IdentifierType.glottolog.value).url(), jsondata={"icon": 'tcccccc'}) # lg.family = family #Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, name=domain) DBSession.flush() #Designers for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")): designer_id = str(i + 1) data.add( models.Designer, info['designer'], id=designer_id, name=designer_id, domain=info["domain"], contributor=info['designer'], pdflink=info["pdflink"], citation=info["citation"]) DBSession.flush() #Sources for k, (typ, bibdata) in [ ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,") ]: if k not in data["Source"]: data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata))) DBSession.flush() #Features fs = [(fid, mergeds(lds)) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdesc = [(fid, [(ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values")]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc] fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1] for (fid, vdescs) in fvmis: print fid, "DIFF VDESC" for (vd, fromf) in vdescs: print vd, set(fromf) for _, dfsids in groupby( sorted((f.get('feature_name', fid), fid) for fid, f in fs), key=lambda t: t[0]): assert len(list(dfsids)) == 1 for fid, f in fs: if not fid.isdigit(): args.log.info("NO INT FID %s" % f) feature = data.add( models.Feature, fid, id=fid, name=f.get('feature_name', f['feature_alphanumid']), doc=f.get('feature_information', ""), vdoc=f.get('feature_possible_values', ""), 
representation=nlgs.get(fid, 0), designer=data["Designer"][f['designer']], dependson=f.get("depends_on", ""), abbreviation=f.get("abbreviation", ""), featuredomain=data['FeatureDomain'][f["feature_domain"]], name_french=f.get('francais', ""), clarification=f.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""), alternative_id=f.get("old feature number", ""), jl_relevant_unit=f.get("relevant unit(s)", ""), jl_function=f.get("function", ""), jl_formal_means=f.get("formal means", ""), sortkey_str="", sortkey_int=int(fid)) vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")] vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist} vdesc.setdefault('?', 'Not known') if 'N/A' not in vdesc and feature.dependson: vdesc["N/A"] = "Not Applicable" vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))} vicons = icons.iconize(vi.keys()) for v, desc in vdesc.items(): data.add( common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": vicons[v]}, number=vi[v], parameter=feature) DBSession.flush() for ((f, lg), ixs) in grp2( [((ld['feature_alphanumid'], ld['language_id']), i) for i, ld in enumerate(ldps)]): ixvs = set([ldps[ix]['value'] for ix in ixs]) if len(ixvs) == 1: continue args.log.warn( "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs])) print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs]) errors = {} done = set() for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['ntsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if id_ in done: continue if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']: if not ld["value"].strip(): continue info = ( ld['feature_alphanumid'], ld.get('feature_name', "[Feature Name Lacking]"), ld['language_id'], ld['value'], ld['fromfile']) msg = u"%s %s %s %s %s not in 
the set of legal values ({0})" % info args.log.error(msg.format(sorted( [y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid']]))) print msg.format(sorted( [y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid']])) errors[(ld['feature_alphanumid'], ld['language_id'])] = info continue vs = common.ValueSet( id=id_, language=language, parameter=parameter, source=ld["source"] or None, contribution=parameter.designer) models.ntsValue( id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata}, comment=ld["comment"], valueset=vs, contributed_datapoint=ld["contributor"]) done.add(id_) if not ld.get('bibsources'): if 'bibsources' not in ld: args.log.warn("no bibsource %s" % ld) continue for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]: common.ValueSetReference(valueset=vs, source=data['Source'][k]) DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['ntsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) #, ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "nts.cldf", encoding = "utf-8") #utf-16 "Comment", "Source", "Bibliographical Details" #cldf = {} #for ld in ldps: # parameter = data['Feature'][ld['feature_alphanumid']] # language = data['ntsLanguage'][ld['language_id']] # id_ = '%s-%s' % (parameter.id, language.id) # if not id_ in done: # continue # dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". 
" + ld['feature_name'], ld["value"], ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")), ld.get("feature_information", ""), ld.get('feature_possible_values', ""), ld["designer"], ld.get("abbreviation", ""), ld["feature_domain"], ld.get('francais', ""), ld.get("dependencies", ""), ld.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", "")) # cldf[dt] = None #savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment", "Source", "Bibliographical Details", "Feature Information", "Feature Possible Values", "Feature Designer", "Feature Abbreviation", "Feature Domain", "Feature (French)", "Feature Dependencies", "Feature Clarifying Comments")] + cldf.keys()), "nts-with-metadata.tsv", encoding="utf-16") args.log.info('%s Errors' % len(errors)) dataset = common.Dataset( id="NTS", name='Nijmegen Typological Survey', publisher_name="Max Planck Institute for Psycholinguistics", publisher_place="Nijmegen", publisher_url="http://www.mpi.nl", description="""Dataset on Typological Features, collected 2013-2014 in the Language and Cognition Department at the Max Planck Institute for Psycholinguistics, Max-Planck Gesellschaft, and a European Research Council's Advanced Grant (269484 "INTERACT") to Stephen C. 
Levinson.""", domain='http://nts.clld.org', published=date(2014, 2, 20), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'}) for i, contributor in enumerate([ common.Contributor( id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**"), common.Contributor( id="Suzanne van der Meer", name="Suzanne van der Meer", email="*****@*****.**"), common.Contributor( id="Hedvig Skirgard", name="Hedvig Skirgard", email="*****@*****.**") ]): common.Editor(dataset=dataset, contributor=contributor, ord=i) DBSession.add(dataset)
def bibliographical_details(bibsources):
    """Render bibliographic source strings as one '; '-joined text string.

    ``bibsources`` is an iterable of raw source strings; blank entries are
    skipped.  Each entry is parsed by ``ktfbib`` into ``(key, (type, fields))``
    and re-rendered via ``Record(...).text()`` with every field value
    bibtex-unescaped.
    """
    # NOTE(review): Python 2 code (u'' literal, dict.iteritems()).  The inner
    # dict comprehension re-binds ``k``; dict comprehensions have their own
    # scope in Python 2.7, so the outer ``k`` is unaffected.
    ktfs = [ktfbib(bibsource) for bibsource in bibsources if bibsource.strip()]
    return u"; ".join([Record(t, k, **{k: bibtex.unescape(v) for (k, v) in f.iteritems()}).text() for (k, (t, f)) in ktfs])
def get_contributors(rec, data):
    """Yield ids of known Members matching an author of bibtex record ``rec``.

    The record's ``author`` field is unescaped and split on the bibtex
    ``and`` separator; each author name is fuzzy-compared (token-sort ratio)
    against every object in ``data['Member']``, yielding the member id on a
    score of 92 or better.  A member id may be yielded more than once if
    several authors match it.
    """
    # FIX: use a raw string for the regex — '\s' in a plain literal is an
    # invalid escape sequence (SyntaxWarning/DeprecationWarning on modern
    # Python); the matched pattern itself is unchanged.
    for author in re.split(r'\s+and\s+', unescape(rec['author'])):
        for cid, obj in data['Member'].items():
            if fuzz.token_sort_ratio(author, obj.name) >= 92:
                yield cid
# NOTE(review): Python 2 import script for the LSI dataset (``print``
# statements, u'' literals, and ``date(2016, 05, 16)`` — a leading-zero
# literal that is a hard syntax error on Python 3).
# What the visible code does: wipes the relevant tables, reads tab files from
# a hard-coded absolute path (TODO: make configurable), registers languages,
# feature domains, placeholder designers, features and domain elements,
# creates lsiValue value sets with per-family map icons, writes "lsi.cldf",
# and adds dataset/editor metadata.
# NOTE(review): ``family`` is referenced when building jsondata below but is
# only assigned inside a conditional branch — presumably a latent NameError
# for the first record if no glottolog family is found; verify before reuse.
# WARNING(review): the original physical formatting of this block was lost
# (statements collapsed across a few very long lines).  The text is preserved
# verbatim below; reformat carefully before editing any logic.
def main(args): data = Data(created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() #print icons DBSession.execute("delete from Language") DBSession.execute("delete from Unit") DBSession.execute("delete from featuredomain") DBSession.execute("delete from family") DBSession.execute("delete from source") DBSession.execute("delete from parameter") DBSession.execute("delete from feature") DBSession.execute("delete from domainelement") DBSession.execute("delete from valueset") DBSession.execute("delete from value") DBSession.execute("delete from lsivalue") DBSession.execute("delete from dataset") DBSession.execute("delete from contributor") DBSession.execute("delete from lsilanguage") DBSession.execute("delete from contribution") DBSession.execute("delete from designer") DBSession.flush() dtab = partial(_dtab, args.data_file()) #Languages #print args.data_file() #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')] #tabfns = ['nts_18.tab'] ##tabfns = os.listdir('/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/data')[1:] tabfns = os.listdir( '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data' )[1:] #print tabfns args.log.info("Sheets found: %s" % tabfns) ldps = [] lgs = {} nfeatures = Counter() nlgs = Counter() for fn in tabfns: for ld in dtab(fn): if ld['language_id'] == 'qgr' or ld['language_id'] == '---' or ld[ 'language_id'] == '': # to exclude languages which do not have an iso-code continue if "feature_alphanumid" not in ld: args.log.info("NO FEATUREID %s %s" % (len(ld), ld)) if not ld["feature_alphanumid"].startswith("DRS") \ and ld["feature_alphanumid"].find(".") == -1: ldps.append(dp_dict(ld)) ##print ld lgs[ld['language_id']] = unescape(ld['language_name']) if ld["value"] != "?": nfeatures.update([ld['language_id']]) nlgs.update([ld['feature_alphanumid']]) ldps = sorted(ldps, key=lambda d: d['feature_alphanumid']) #lgs["ygr"] = "Hua" for lgid, lgname 
in lgs.items(): data.add(models.lsiLanguage, lgid, id=lgid, name=lgname, representation=nfeatures.get(lgid, 0)) DBSession.flush() #print "I am here" #print data['ntsLanguage'].values()[1].id load_families( data, ##[(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values()], [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values() if l.id != '---' and l.id != ''], isolates_icon='tcccccc') #print 'family' #print data['Family'].get('sino1245').jsondata #Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, name=domain) DBSession.flush() #Designers #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")): for i, info in enumerate([{ 'designer': 'shafqat', 'domain': '', 'pdflink': '', 'citation': '' }, { 'designer': '-', 'domain': '', 'pdflink': '', 'citation': '' }]): designer_id = str(i + 1) data.add(models.Designer, info['designer'], id=designer_id, name=designer_id, domain=info["domain"], contributor=info['designer'], pdflink=info["pdflink"], citation=info["citation"]) DBSession.flush() #Sources '''for k, (typ, bibdata) in [ ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,") ]: if k not in data["Source"]: data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata))) DBSession.flush()''' #Features fs = [(fid, mergeds(lds)) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdesc = [(fid, [ (ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values") ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc] fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1] for _, dfsids in groupby(sorted( (f.get('feature_name', fid), fid) for fid, f in fs), key=lambda t: t[0]): ##print [(k,v) for (k,v) in list(dfsids)],len(list(dfsids)) assert 
len(list(dfsids)) == 1 #print 'here is nlgs' for fid, f in fs: #print "lang name" #print ldps #print f.get('feature_possible_values', ""), if not fid.isdigit(): args.log.info("NO INT FID %s" % f) feature = data.add( models.Feature, fid, id=fid, name=f.get('feature_name', f['feature_alphanumid']), doc=f.get('feature_information', ""), vdoc=f.get('feature_possible_values', ""), representation=nlgs.get(fid, 0), designer=data["Designer"][f['designer']], dependson=f.get("depends_on", ""), abbreviation=f.get("abbreviation", ""), featuredomain=data['FeatureDomain'][f["feature_domain"]], name_french=f.get('francais', ""), clarification=f.get( "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""), alternative_id=f.get("old feature number", ""), jl_relevant_unit=f.get("relevant unit(s)", ""), jl_function=f.get("function", ""), jl_formal_means=f.get("formal means", ""), sortkey_str="", sortkey_int=int(fid)) vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")] vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist} ##vdesc = {fmly+val2icon(v): desc for ((v,desc),fmly) in itertools.product([(vv,desc) for [vv, desc] in vdesclist],['c','d','f','t'])} vdesc.setdefault('?', 'Not known') if 'N/A' not in vdesc and feature.dependson: vdesc["N/A"] = "Not Applicable" vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))} ##vicons = {f+val2icon(v):f+val2icon(v) for (v,f) in itertools.product(['0','1','2','3'],['c','d','f','t'])} ##vicons['?'] = 'c00ffff' ##vicons['N/A'] = 'c00ffff' ##vicons = icons.iconize(vi.keys()) for (v, desc) in vdesc.items(): #print v,vicons[v] data.add(common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": Colors[v]}, number=vi[v], parameter=feature) DBSession.flush() for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']), i) for i, ld in enumerate(ldps)]): ixvs = set([ldps[ix]['value'] for ix in ixs]) if len(ixvs) == 1: continue args.log.warn("Dup 
value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs])) ##print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs]) errors = {} done = set() glottolog = Glottolog() for ld in ldps: ############################### for printing different map markers for different familys for features:shafqat #print data['Family'] language = data['lsiLanguage'][ld['language_id']] if isinstance(language, (tuple, list)) and len(language) == 2: code, language = language else: code = language.id if code != '-': gl_language = glottolog.languoid(code) if gl_language: gl_family = gl_language.family if gl_family: family = data['Family'].get(gl_family.id) ##ld['value'] = ld['value']+'-'+str(family) ##ld['value'] = combineValueFamily(ld['value'],str(family)) #print family ##################################### parameter = data['Feature'][ld['feature_alphanumid']] language = data['lsiLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if id_ in done: continue if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']: if not ld["value"].strip(): continue info = (ld['feature_alphanumid'], ld.get('feature_name', "[Feature Name Lacking]"), ld['language_id'], ld['value'], ld['fromfile']) msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info args.log.error( msg.format( sorted([ y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid'] ]))) ##print msg.format(sorted( ## [y for (x, y) in data['DomainElement'].keys() ## if x == ld['feature_alphanumid']])) errors[(ld['feature_alphanumid'], ld['language_id'])] = info continue vs = common.ValueSet( id=id_, language=language, parameter=parameter, source=ld["source"] or None, ##contribution=parameter.designer ) #print #print "this one" #print ld['value'],family models.lsiValue( id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={ "icon": 
data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata, "family": FamilyCodes[str(family)] }, comment=ld["comment"], valueset=vs, contributed_datapoint=ld["contributor"]) done.add(id_) '''if not ld.get('bibsources'): if 'bibsources' not in ld: args.log.warn("no bibsource %s" % ld) continue for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]: common.ValueSetReference(valueset=vs, source=data['Source'][k])''' DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['lsiLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "lsi.cldf") args.log.info('%s Errors' % len(errors)) dataset = common.Dataset( id="LSI", name='Linguistic Survey of India', publisher_name="Sprakbanken", publisher_place="Gothenburg", publisher_url="to be given", description="this is to be followed", domain='http://lsi.clld.org', published=date(2016, 05, 16), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany' }) # disabled for experimental purposes, names were appearing multiple times for i, contributor in enumerate([ common.Contributor(id="Lars Borin", name="Lars Borin", email="*****@*****.**"), common.Contributor(id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**"), common.Contributor(id="Anju Saxena", name="Anju Saxena", email="*****@*****.**"), common.Contributor(id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**") ]): 
#print i common.Editor(dataset=dataset, contributor=contributor, ord=i) '''cont1 = common.Contributor( id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**") cont2= common.Contributor( id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**") cont3 = common.Contributor( id="Lars Borin", name="Lars Borin", email="*****@*****.**") for contributor in [cont1,cont2,cont3]: common.Editor(dataset=dataset, contributor=contributor,ord=1)''' DBSession.add(dataset) DBSession.flush()
def test_unescape(self):
    """The LaTeX escape '\\ss ' must unescape to the German sharp s."""
    from clld.lib.bibtex import unescape

    result = unescape(r"\ss ")
    assert result == 'ß'