def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__,
        name="AUTOTYP",
        description="AUTOTYP",
        domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # columns: LID language ISO639.3.2013 stock continent area latitude longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid, l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area)
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_exclusive(args, data, bib)
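
# NOTE: `coord` is used above but not defined in this module. The sketch below
# is a minimal assumed implementation (empty or 'NA' cells become None), not
# necessarily the original helper.
def coord(value):
    """Parse a latitude/longitude cell from the backbone export."""
    value = (value or '').strip()
    if not value or value == 'NA':
        return None
    return float(value)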
def main(args):
    citations.main(args)
    data = Data()

    pairs = {}
    languages = {}

    coords = {}
    for lang in dsv.rows(
            args.data_file('MB_Map_Data_Aug13WLabels'),
            namedtuples=True,
            newline='\n',
            encoding='latin1'):
        coords[slug(lang.Label.split('<')[0].strip())] = (
            float(lang.y), float(lang.x))

    xls = xlrd.open_workbook(args.data_file('MB_BoCatSum_AFBO.xlsx'))
    matrix = xls.sheet_by_name('MB_BoCatSum_AFBO.txt')
    md = "area\trecipient language iso\trecipient language genus\tdonor language iso\tdonor language genus".split('\t')

    fields = []
    params = []
    for i in range(matrix.ncols):
        colname = xlrd.colname(i)
        if len(colname) == 2 and colname > 'BE':
            break
        colval = matrix.cell(0, i).value.strip()
        if (len(colname) == 1 and colname > 'G') or (len(colname) == 2 and colname < 'AY'):
            params.append(colval)
            fields.append(colval)
        else:
            fields.append(colval.lower())

    for f in fields:
        if fields.count(f) > 1:
            print(f)
    assert len(fields) == len(set(fields))

    for j in range(1, matrix.nrows):
        values = dict(zip(fields, [matrix.cell(j, i).value for i in range(matrix.ncols)]))
        try:
            id_ = int(values['perm.id'])
        except (KeyError, ValueError):
            continue
        pairs[id_] = values
        for type_ in ['recipient', 'donor']:
            languages[values[type_ + ' language'].strip()] = {
                'macroarea': values['area']}
            for md in ['iso', 'genus']:
                languages[values[type_ + ' language'].strip()][md] \
                    = values['%s language %s' % (type_, md)]

    for name in COORDS:
        assert name in languages

    sources = {}
    with open(args.data_file('MB_Case_List_with_links.html')) as fp:
        worddoc = fp.read()
    for m in re.finditer('\"__(?P<recid>[^_]+)__\"', worddoc):
        sources[m.group('recid').decode('utf8')] = 1
    soup = bs(worddoc)

    doc = {}
    cols = []
    table = soup.find('table')
    for tr in table.children:
        if tr.name != 'tr':
            continue
        tds = filter(lambda n: n.name == 'td', tr.children)
        if not cols:
            cols = map(text, tds)
        else:
            values = dict(zip(cols, tds))
            try:
                id_ = int(text(values['perm.id']))
                doc[id_] = values
                if id_ in pairs:
                    assert text(values['Recipient lg.']) == pairs[id_]['recipient language']
                    assert text(values['Don']) == pairs[id_]['donor language']
            except (KeyError, ValueError):
                continue

    dataset = common.Dataset(
        id='afbo',
        name="AfBo: A world-wide survey of affix borrowing",
        contact="*****@*****.**",
        domain="afbo.info",
        license='http://creativecommons.org/licenses/by/3.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})

    DBSession.add(dataset)
    for i, spec in enumerate([('seifart', "Frank Seifart")]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    contrib = data.add(common.Contribution, 'afbo', name="AfBo", id="afbo")

    iso_map = {
        ('ron', 'Meglenite Romanian'): ('ruq', None),
        ('fra', 'Norman French'): ('xno', None),
        ('tur', 'Turkic'): (None, 'turk1311'),
        ('xuu', 'Kxoe languages'): (None, 'khoe1241'),
        ('zoc', 'Zoquean languages'): (None, 'zoqu1261'),
        ('tzm', 'Moroccan Berber languages'): (None, 'atla1275'),
        ('cvn', 'Quechua'): ('qvn', None),
        ('rop', 'Gurindji Kriol'): (None, 'guri1249'),
        ('ita', 'Sicilian Italian'): ('scn', None),
        ('srp', 'Croatian'): ('hrv', None),
        ('eme', 'Wayampi-Emerillon-Zo’é'): (None, 'waya1271'),
        ('ale', 'Copper Island Aleut'): ('mud', None),
        ('car', 'intermediate Proto-Carib'): (None, 'cari1283'),
        ('ell', 'Cappadocian Greek'): ('cpg', None),
        ('eng', 'Middle English'): ('enm', None),
        ('als', 'Arvanitic Albanian'): ('aat', None),
        ('nys', 'Northern Nyungic'): (None, 'dese1234'),
        ('ron', 'Istro-Romanian'): ('ruo', None),
        ('chf', 'Cho’ol'): ('ctu', None),
        ('tuo', 'Eastern Tucanoan languages'): (None, 'east2698'),
        ('ceb', 'Visayan'): (None, 'bisa1268'),
        ('por', 'Sri Lanka Portuguese'): (None, 'mala1544'),
        ('brx', 'Tibeto-Burman languages'): (None, 'brah1260'),
    }

    with open('name_conflicts.tab', 'w') as fp:
        fp.write('iso\tafbo\tglottolog\tproposed iso\n')
        for i, name in enumerate(languages.keys()):
            md = languages[name]
            iso = md.pop('iso')
            if iso == 'cvn' and name == 'Quechua':
                iso = 'qvn'
            kw = dict(name=name, id=str(i + 1), jsondata=md)
            if name in COORDS:
                kw['latitude'], kw['longitude'] = COORDS[name]
            elif slug(name) in coords:
                kw['latitude'], kw['longitude'] = coords[slug(name)]
            elif glottocoords.get(iso):
                kw['latitude'], kw['longitude'] = glottocoords[iso]

            if glottonames.get(iso) and slug(glottonames.get(iso)) != slug(name):
                fp.write(('%s\t%s\t%s\t%s\n' % (
                    iso, name, glottonames.get(iso),
                    rglottonames.get(slug(name), ''))).encode('utf8'))

            if name == 'Meglenite Romanian':
                kw['name'] = 'Megleno Romanian'
            if 'latitude' not in kw:
                print(name)
            l = data.add(common.Language, name, **kw)

            iso, gc = iso_map.get((iso, name), (iso, None))

            for code, type_ in [
                (iso, common.IdentifierType.iso),
                (gc or glottocodes.get(iso), common.IdentifierType.glottolog)
            ]:
                if code:
                    identifier = data.add(
                        common.Identifier, code, id=code, name=code, type=type_.value)
                    data.add(
                        common.LanguageIdentifier, '%s-%s' % (code, l.id),
                        identifier=identifier, language=l)

    include = sources.keys() + [
        'myersscottoncontact2002',
        'myersscottonlanguage2007',
        'meakinsborrowing2011',
        'seifartprinciple2012',
    ]
    refdb = bibtex.Database.from_file(args.data_file('FSeifartZoteroLibrary14Nov2013.bib'))
    for rec in refdb:
        if slug(rec.id) in include:
            data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for i, name in enumerate(params):
        data.add(models.AffixFunction, name, id=str(i + 1), name=name)

    for id_, vd in pairs.items():
        assert id_ in doc

        donor = data['Language'][vd['donor language'].strip()]
        recipient = data['Language'][vd['recipient language'].strip()]

        p = data.add(
            models.Pair,
            id_,
            id=str(id_),
            name=vd['pairs'].replace('Meglenite', 'Megleno'),
            area=recipient.jsondata['macroarea'],
            description=unicode(doc[id_]['comment'])
                .replace('<h1', '<p').replace('</h1>', '</p>')
                .replace('Meglenite', 'Megleno'),
            reliability=vd['reliability'],
            int_reliability=['high', 'mid', 'low'].index(vd['reliability']),
            count_interrel=int(vd[u'number of interrelated affixes']),
            count_borrowed=int(vd['number of borrowed affixes']),
            donor=donor,
            recipient=recipient)
        DBSession.flush()

        for i, param in enumerate(params):
            param_id = i + 1
            value = vd[param]
            if value != '':
                vsid = '%s-%s' % (recipient.id, param_id)
                if vsid in data['ValueSet']:
                    vs = data['ValueSet'][vsid]
                else:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id=vsid,
                        parameter=data['AffixFunction'][param],
                        language=recipient,
                        contribution=contrib)
                data.add(
                    models.waabValue,
                    '%s-%s' % (id_, param_id),
                    id='%s-%s' % (id_, param_id),
                    pair=p,
                    name='%s' % int(value),
                    numeric=int(value),
                    description='%s' % p,
                    valueset=vs)
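
# NOTE: `text` is used in the HTML-scraping block above but not defined in this
# module. A minimal assumed version (extract a BeautifulSoup node's stripped
# string content) could look like this; the original helper may differ.
def text(node):
    """Return the stripped text content of a BeautifulSoup node."""
    return node.get_text().strip()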
def main(args):
    active_only = not args.all
    coords = dict(
        (r[0], r[1:]) for r in dsv.rows(args.data_file('coordinates.tab')))

    codes = dict((row[0], row[1]) for row in DBSession.execute(
        "select ll.hid, l.pk from languoid as ll, language as l "
        "where ll.pk = l.pk and ll.hid is not null"))
    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}

    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()

    parse_families(args.data_file('lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to look up families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(args.data_file('lof.txt'), families, languages)

    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            if coords.get(code):
                attrs['longitude'], attrs['latitude'] = map(float, coords.get(code))
            languoids.append(attrs)

    urnodes = {}
    rnodes = {}
    for family in families:
        leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(sorted(code for code in families[family].keys() if code in codes))
        assert leafs
        if leafs in rnodes:
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs.
            assert [n for n in family if n.startswith('Unclassified')]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #

    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    todo = []

    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}

    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        leafs = [r[0] for r in DBSession.execute(
            "select distinct l.hid from treeclosuretable as t, languoid as l "
            "where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null "
            "and l.status != 'provisional'" % row[0])]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            assert m.hid not in branch_to_pk
            branch_to_pk[m.hid] = m.pk

    new = 0
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                assert [n for n in hnode if n.startswith('Unclassified')]
                # make sure the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(m.pointer).encode('utf8')
                migrations += 1
                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l "
            "where ll.pk = l.pk and ll.hid like '%NOCODE_%'").fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(row[0], 'language', status='retired', active=False, father_pk=None)
            languoids.append(attrs)

    with open(args.data_file('languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
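
# NOTE: `languoid` and `Migration` are used above but defined elsewhere. The
# minimal assumed versions below match how they are used here (keyword
# passthrough into a dict; optional `pointer`/`rename` attributes); they are
# sketches, not the originals.
def languoid(pk, level, **kw):
    """Bundle the attributes of one languoid row into a JSON-serializable dict."""
    kw.update(pk=pk, level=level)
    return kw


class Migration(object):
    """Pair a legacy glottolog family node (pk) with a branch of the new tree (hid)."""
    def __init__(self, pk, hid, pointer=None, rename=False):
        self.pk = pk
        self.hid = hid
        self.pointer = pointer
        self.rename = rename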
def inclusive_exclusive(args, data, bib):
    """
    Incl    Inclusive/exclusive distinction. 1 = present, 0 = absent.
    Belh    Belhare-type inclusive/exclusive distinction. 1 = present, 0 = absent.
            NA = no information available.
    MinAug  Minimal/augmented system. 1 = present, 0 = absent. 1? = probably present
    """
    value_map = {
        '0': 'absent',
        '1': 'present',
        '1?': 'probably present',
        'NA': 'no information available'}

    name_map = OrderedDict()
    name_map['Incl'] = 'Inclusive/exclusive distinction'
    name_map['Belh'] = 'Belhare-type inclusive/exclusive distinction'
    name_map['MinAug'] = 'Minimal/augmented system'

    varspec = [(name, set()) for name in name_map.values()]
    rev_name_map = dict(zip(name_map.values(), name_map.keys()))

    p, contrib = param_and_contrib(
        data, 'inclusive/exclusive distinction', 'inclusive.exclusive', 2)

    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['bickel']))
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['nichols']))

    allv = rows(
        args.data_file('InclExcl_ISO_bib_stripped.txt'),
        namedtuples=True,
        encoding='utf8',
        newline='\r')

    for lid, values in groupby(sorted(allv, key=lambda j: j.LID), lambda i: i.LID):
        vsid = '%s-%s' % (p.id, lid)
        values = list(values)
        if vsid not in data['ValueSet']:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['Languoid'][lid],
                contribution=contrib,
                parameter=p)
        else:
            vs = data['ValueSet'][vsid]

        bibkeys = []
        for v in values:
            bibkeys.extend(filter(None, [k.strip() for k in v.bibkey.split(',')]))

        for key in set(bibkeys):
            if key in data['Source']:
                source = data['Source'][key]
            else:
                if key in bib.keymap:
                    source = data.add(common.Source, key, _obj=bibtex2source(bib[key]))
                else:
                    print key
                    source = None
            if source:
                DBSession.add(common.ValueSetReference(valueset=vs, source=source))

        for i, value in enumerate(values):
            if i > 0:
                print 'multiple values!'
                raise ValueError
            value_data = OrderedDict()
            for var in name_map.keys():
                val = value_map.get(getattr(value, var))
                if not val:
                    print getattr(value, var)
                    raise ValueError
                value_data[var] = val
            v = data.add(
                common.Value, vsid,
                id=vsid,
                name=' / '.join(value_data.values()),
                #jsondata=value,
                valueset=vs)
            DBSession.flush()
            for j, spec in enumerate(varspec):
                attr, domain = spec
                domain.add(value_data[rev_name_map[attr]])
                DBSession.add(common.Value_data(
                    key=attr,
                    value=value_data[rev_name_map[attr]],
                    ord=j,
                    object_pk=v.pk))

    p.jsondata = {'varspec': [(name, list(domain)) for name, domain in varspec]}
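
# NOTE: `param_and_contrib` is imported from elsewhere; judging from the call
# above it creates (or retrieves) the Parameter and the Contribution for one
# AUTOTYP module. This is an assumed sketch only; the real helper presumably
# also uses its integer argument (here `ord_`), whose role is not visible here.
def param_and_contrib(data, name, id_, ord_):
    """Return a (Parameter, Contribution) pair for an AUTOTYP module."""
    p = data.add(common.Parameter, name, id=slug(id_), name=name)
    contrib = data.add(common.Contribution, name, id=slug(id_), name=name)
    return p, contrib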