# batch handler closure used by `insert`; `value`, `start`, `model` and
# `values` are taken from the enclosing scope.
def handler(offset, batch):
    _values = [
        value(start + offset + i + 1, row) for i, row in enumerate(batch)
    ]
    DBSession.execute(model.__table__.insert(), _values)
    values.extend(_values)
def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print count, 'new multitree identifiers'
def insert(db, table, model, value, start=0, order=None):
    log.info('migrating %s ...' % table)
    sql = 'select * from %s' % table
    values = []

    if not order:
        log.info(sql)
        values = [
            value(start + i + 1, row) for i, row in enumerate(db.execute(sql))
        ]
        DBSession.execute(model.__table__.insert(), values)
    else:
        def handler(offset, batch):
            _values = [
                value(start + offset + i + 1, row)
                for i, row in enumerate(batch)
            ]
            DBSession.execute(model.__table__.insert(), _values)
            values.extend(_values)

        order = [order] if isinstance(order, basestring) else order
        select(db, '%s order by %s' % (sql, ', '.join(order)), handler)

    DBSession.execute('COMMIT')
    log.info('... done')
    return values
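# Usage sketch (not part of the original module): this mirrors how `insert`
# is called elsewhere in the migration script. The 'macroarea' table, the
# `models2.Macroarea` model and the column names are taken from a similar
# call site and serve as an illustration only.
insert(
    db, 'macroarea', models2.Macroarea,
    lambda i, row: dict(
        pk=row['id'], id=slug(row['name']),
        name=row['name'], description=row['description']))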
def handler(offset, batch):
    svalues = []
    rvalues = []
    for row in batch:
        jsondata = json.loads(row['jsondata'] or "{}")
        jsondata['bibtexkey'] = row['bibtexkey']
        dicts = {
            's': dict(
                pk=row['id'],
                polymorphic_type='base',
                id=str(row['id']),
                name='%(author)s %(year)s' % row,
                description=row['title'],
                bibtex_type=getattr(EntryType, row['type']),
                jsondata=jsondata),
            'r': dict(pk=row['id']),
        }
        for model, map_ in {
            's': {
                'author': None,
                'yearstring': 'year',
                'year': 'year_int',
                'startpage': 'startpage_int',
                'numberofpages': 'pages_int',
                'pages': None,
                'edition': None,
                'school': None,
                'address': None,
                'url': None,
                'note': None,
                'number': None,
                'series': None,
                'editor': None,
                'booktitle': None,
                'journal': None,
                'volume': None,
                'publisher': None,
            },
            'r': {
                'endpage': 'endpage_int',
                'inlg': None,
                'inlg_code': None,
                'subject': None,
                'subject_headings': None,
                'keywords': None,
                'normalizedauthorstring': None,
                'normalizededitorstring': None,
                'ozbib_id': None,
            }
        }.items():
            for okey, nkey in map_.items():
                dicts[model][nkey or okey] = row[okey]
        svalues.append(dicts['s'])
        rvalues.append(dicts['r'])
    DBSession.execute(common.Source.__table__.insert(), svalues)
    DBSession.execute(models2.Ref.__table__.insert(), rvalues)
def dataset_detail_html(context=None, request=None, **kw):
    """
    #unique language names: 6895
    #Ethnologue families: 223
    #Glottolog families: 381
    #languages with unique ISO codes: 4424 match [a-z]{3}!

    asjp=# select count(*) from (select distinct name from identifier where type = 'iso639-3') as s;
    -[ RECORD 1 ]
    count | 4401

    #words in the database (not counting synonyms): 238976
    and counting synonyms: ...
    """
    stats = {
        'wordlists': DBSession.query(common.Language.pk).count(),
        'ethnologue_families': DBSession.query(models.Doculect.ethnologue_family)
            .distinct().count(),
        'glottolog_families': DBSession.query(models.Doculect.glottolog_family)
            .distinct().count(),
        'iso_langs': DBSession.query(common.Identifier.name)
            .filter(common.Identifier.type == common.IdentifierType.iso.value)
            .distinct().count(),
        'synsets': DBSession.execute(
            'select count(*) from (select distinct valueset_pk from value) as s')
            .fetchone()[0],
        'words': DBSession.query(common.Value.pk).count(),
    }
    return stats
def dataset_detail_html(context=None, request=None, **kw):
    """
    #unique language names: 6895
    #Ethnologue families: 223
    #Glottolog families: 381
    #languages with unique ISO codes: 4424 match [a-z]{3}!

    asjp=# select count(*) from (select distinct name from identifier where type = 'iso639-3') as s;
    -[ RECORD 1 ]
    count | 4401

    #words in the database (not counting synonyms): 238976
    and counting synonyms: ...
    """
    stats = {
        'wordlists': DBSession.query(common.Language.pk).count(),
        'ethnologue_families': DBSession.query(models.Doculect.ethnologue_family)
            .distinct().count(),
        'glottolog_families': DBSession.query(models.Doculect.glottolog_family)
            .distinct().count(),
        'iso_langs': DBSession.query(common.Identifier.name)
            .filter(common.Identifier.type == common.IdentifierType.iso.value)
            .distinct().count(),
        'synsets': DBSession.execute(
            'select count(*) from (select distinct valueset_pk from value) as s')
            .fetchone()[0],
        'words': DBSession.query(common.Value.pk).count(),
        'missing_iso': len(missing_iso()),
    }
    return {k: '{0:,}'.format(n) for k, n in stats.items()}
def dataset_detail_html(context=None, request=None, **kw):
    res = dict(
        (row[0], row[1])
        for row in DBSession.execute(
            "select source, count(pk) from inventory group by source"))
    res["inventory_count"] = DBSession.query(Inventory).count()
    res["segment_count"] = DBSession.query(Parameter).count()
    res["language_count"] = DBSession.query(Language).count()
    res["contributors"] = (
        DBSession.query(Contributor)
        .order_by(Contributor.name)
        .options(
            joinedload(Contributor.contribution_assocs),
            joinedload(Contributor.references))
        .all()
    )
    res["sources"] = {
        k: Source.get(k)
        for k in [
            "moisikesling2011",
            "ipa2005",
            "hayes2009",
            "moran2012a",
            "moranetal2012",
            "cysouwetal2012",
            "mccloyetal2013",
        ]
    }
    res["descriptions"] = {
        c.id: desc(request, c.description, res["sources"])
        for c in res["contributors"]}
    return res
def extract_data(endangerment):  # pragma: no cover
    status = {}
    lpks = DBSession.query(common.Language.pk) \
        .filter(common.Language.active == True) \
        .filter(common.Language.latitude != None) \
        .filter(Languoid.level == LanguoidLevel.language) \
        .order_by(common.Language.pk).all()
    print(len(lpks))

    sql = """\
select ls.source_pk, count(ls.language_pk)
from languagesource as ls, ref as r
where
    ls.source_pk = r.pk
    and r.ca_doctype_trigger is null and r.ca_language_trigger is null
group by source_pk
"""
    lcounts = {r[0]: r[1] for r in DBSession.execute(sql)}

    # loop over active, established languages with geo-coords
    for i, lpk in enumerate(lpks):
        l = DBSession.query(common.Language).filter(common.Language.pk == lpk).one()

        # let's collect the relevant sources in a way that allows computation of med.
        # Note: we limit refs to the ones without computerized assignments.
        sources = list(DBSession.query(Ref).join(common.LanguageSource)
                       .filter(common.LanguageSource.language_pk == lpk)
                       .filter(Ref.ca_doctype_trigger == None)
                       .filter(Ref.ca_language_trigger == None)
                       .options(joinedload(Ref.doctypes)))
        sources = sorted([Source(s, lcounts.get(s.pk, 0)) for s in sources])

        # keep the overall med
        # note: this source may not be included in the potential meds computed
        # below, e.g. because it may not have a year.
        med = sources[0].__json__() if sources else None

        # now we have to compute meds respecting a cut-off year.
        # to do so, we collect eligible sources per year and then
        # take the med of this collection.
        potential_meds = []

        # we only have to loop over publication years within all sources, because
        # only in these years something better might have come along.
        for year in set(s.year for s in sources if s.year):
            # let's see if something better was published!
            eligible = [s for s in sources if s.year and s.year <= year]
            if eligible:
                potential_meds.append(sorted(eligible)[0])

        # we store the precomputed sources information as jsondata:
        status[l.id] = [
            med,
            [s.__json__() for s in
             sorted(set(potential_meds), key=lambda s: -s.year)],
            endangerment.get(l.id, {}).get('source')
        ]
        if i and i % 1000 == 0:
            print(i)

    DBSession.close()
    return status
def prime_cache(args):
    #
    # add precalculated scores for meanings and semantic fields:
    #
    icons_dir = path(wold2.__file__).dirname().joinpath('static')

    for vocab in DBSession.query(models.Vocabulary):
        figure(figsize=(0.4, 0.4))
        axes([0.1, 0.1, 0.8, 0.8])
        coll = pie((100,), colors=('#' + vocab.color,))
        coll[0][0].set_linewidth(0.5)
        savefig(icons_dir.joinpath('%s.png' % vocab.color), transparent=True)

        words = DBSession.query(models.Word.borrowed_score)\
            .join(common.Unit.language)\
            .join(models.WoldLanguage.vocabulary)\
            .filter(models.Vocabulary.pk == vocab.pk)\
            .filter(models.Word.borrowed_score != None)
        vocab.count_words = words.count()
        vocab.borrowed_score = sum(score[0] for score in words) / float(vocab.count_words)

        vocab.count_core_list_counterparts = DBSession.query(models.Counterpart)\
            .join(common.Value.valueset)\
            .join(common.ValueSet.parameter)\
            .filter(common.ValueSet.contribution_pk == vocab.pk)\
            .filter(models.Meaning.core_list == True)\
            .count()

    for meaning in DBSession.query(models.Meaning):
        meaning.representation = DBSession.query(models.Counterpart)\
            .join(common.ValueSet)\
            .filter(common.ValueSet.parameter_pk == meaning.pk)\
            .count()

    for type_ in ['borrowed', 'age', 'simplicity']:
        attr = '%s_score' % type_

        for row in DBSession.execute(models.score_per_meaning_query(type_)):
            meaning = DBSession.query(models.Meaning).get(row['meaning_pk'])
            setattr(meaning, attr, row[attr])

        for row in DBSession.execute(models.score_per_semanticfield_query(type_)):
            sf = DBSession.query(models.SemanticField).get(row['semantic_field_pk'])
            setattr(sf, attr, row[1])
def select_leafs(pk):
    l, tc = Languoid.__table__.alias('l'), TreeClosureTable.__table__.alias('tc')
    return [
        r['l_hid'] for r in DBSession.execute(
            select([l, tc], use_labels=True).where(
                and_(
                    l.c.pk == tc.c.child_pk,
                    l.c.hid != None,
                    l.c.status != LanguoidStatus.provisional,
                    tc.c.parent_pk == pk)))
    ]
def prime_cache(args): """If data needs to be denormalized for lookup, do that here. This procedure should be separate from the db initialization, because it will have to be run periodiucally whenever data has been updated. """ DBSession.execute('delete from closuretable') SQL = ClosureTable.__table__.insert() # store a mapping of pk to father_pk for all languoids: father_map = {r[0]: r[1] for r in DBSession.execute('select pk, father_pk from treefeature')} # we compute the ancestry for each single languoid for pk, father_pk in father_map.items(): depth = 1 # now follow up the line of ancestors while father_pk: DBSession.execute(SQL, dict(child_pk=pk, parent_pk=father_pk, depth=depth)) depth += 1 father_pk = father_map[father_pk]
def template_context(self, ctx, req):
    pairs = [(r[0], r[1]) for r in DBSession.execute(SQL.format(ctx.pk))]
    values = {
        v.pk: v for v in DBSession.query(Value)
        .filter(Value.pk.in_(list(chain(*pairs))))
        .options(
            joinedload_all(Value.valueset, ValueSet.language, LexibankLanguage.family),
            joinedload(Value.valueset, ValueSet.parameter),
            joinedload(Value.valueset, ValueSet.contribution))
    }
    distinct_families = defaultdict(set)
    for v1, v2 in pairs:
        distinct_families[values[v2].valueset.parameter_pk].add(
            values[v2].valueset.language.family_pk)
    return {'pairs': pairs, 'values': values, 'families': distinct_families}
def template_context(self, ctx, req):
    pairs = [(r[0], r[1]) for r in DBSession.execute(SQL.format(ctx.pk))]
    values = {
        v.pk: v for v in DBSession.query(Value)
        .filter(Value.pk.in_(list(chain(*pairs))))
        .options(
            joinedload_all(Value.valueset, ValueSet.language, LexiRumahLanguage.family),
            joinedload(Value.valueset, ValueSet.parameter),
            joinedload(Value.valueset, ValueSet.contribution))
    }
    distinct_families = defaultdict(set)
    for v1, v2 in pairs:
        distinct_families[values[v2].valueset.parameter_pk].add(
            values[v2].valueset.language.family_pk)
    return {'pairs': pairs, 'values': values, 'families': distinct_families}
def ldstatus(ppk):
    sql = """\
select l.id, v.domainelement_pk, vs.source, l.jsondata::json->>'meds'
from language as l, languoid as ll, valueset as vs, value as v, parameter as p
where
    l.jsondata::json->>'meds' is not null
    and l.pk = vs.language_pk and vs.parameter_pk = p.pk and v.valueset_pk = vs.pk
    and vs.parameter_pk = {0}
    and ll.pk = l.pk
    and ll.category in ('Spoken L1 Language', 'Sign Language')
""".format(ppk)
    res = {}
    for lid, aespk, aes_source, meds in DBSession.execute(sql):
        meds = json.loads(meds)
        res[lid] = (aespk, meds[0] if meds else None, meds, aes_source)
    return res
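# Hedged example of consuming the mapping returned by `ldstatus`; `aes_ppk`
# is a placeholder for the pk of the relevant parameter, not a value taken
# from this code base.
for lid, (aespk, first_med, meds, aes_source) in ldstatus(aes_ppk).items():
    print(lid, aespk, len(meds))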
def jqtree(self, icon_map=None):
    tree_ = []
    children_map = {}
    children_of_self = [c.pk for c in self.children]

    for row in DBSession.execute("""\
select
    ll.father_pk, c.child_pk, l.id, l.name, l.latitude, ll.hid, ll.level,
    ll.status, ll.child_language_count, c.depth
from
    treeclosuretable as c, language as l, languoid as ll, language as l2
where
    l.active is true
    and c.parent_pk = l2.pk and c.child_pk = l.pk and c.child_pk = ll.pk
    and c.parent_pk = %s
order by l2.name, c.depth, l.name;""" % (self.family_pk or self.pk,)):
        fpk, cpk, id_, name, lat, hid, level, status, clc, depth = row
        if hid and len(hid) != 3:
            hid = None

        label = name
        if clc:
            label += ' (%s)' % clc
        #label = '%s [%s]' % (name, id_)
        #if level == 'language' and hid and len(hid) == 3:
        #    label += '[%s]' % hid

        node = {
            'id': id_,
            'pk': cpk,
            'iso': hid,
            'level': level,
            'status': status,
            'label': label,
            'children': []}
        if icon_map and id_ == self.id and lat:
            node['map_marker'] = icon_map[cpk]
        if cpk in children_of_self:
            node['child'] = True
            if icon_map and (level == 'family' or lat):
                node['map_marker'] = icon_map[cpk]
        children_map[cpk] = node['children']

        if not fpk:
            tree_.append(node)
        else:
            if fpk not in children_map:
                # this can be the case for dialects attached to inactive nodes
                continue
            children_map[fpk].append(node)
    return tree_
def dump(fn="gbdump.tsv"):
    import io
    # dumpsql = """
    #select l.id, p.id, v.name, v.description, s.name
    #from value as v, language as l, parameter as p, valueset as vs
    #    LEFT OUTER JOIN valuesetreference as vsref ON vsref.valueset_pk = vs.pk
    #    LEFT OUTER JOIN source as s ON vsref.source_pk = s.pk
    #where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
    # """
    #datatriples = grp2([((v[0], v[1], v[2], v[3] or ""), v[4] or "") for v in DBSession.execute(dumpsql)])
    #dump = [xs + ("; ".join(refs),) for (xs, refs) in datatriples.iteritems()]

    dumpsql = """
select l.name, l.id, p.id, v.name, v.description, vs.source
from value as v, language as l, parameter as p, valueset as vs
where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
"""
    dump = [v for v in DBSession.execute(dumpsql)]
    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    txt = tab(
        [("Language_Name", "Language_ID", "Feature", "Value", "Comment", "Source")]
        + dump)
    with io.open(fn, 'w', encoding="utf-8") as fp:
        fp.write(txt)
def dataset_detail_html(context=None, request=None, **kw):
    stats = {
        'wordlists': DBSession.query(common.Language.pk).count(),
        #'ethnologue_families': DBSession.query(
        #    models.Doculect.ethnologue_family).distinct().count(),
        #'glottolog_families': DBSession.query(
        #    models.Doculect.glottolog_family).distinct().count(),
        'iso_langs': DBSession.query(common.Identifier.name).filter(
            common.Identifier.type == common.IdentifierType.iso.value).distinct().count(),
        'synsets': DBSession.execute(
            'select count(*) from (select distinct valueset_pk from value) as s'
        ).fetchone()[0],
        'words': DBSession.query(common.Value.pk).count(),
        # 'missing_iso': len(missing_iso()),
    }
    return {k: '{0:,}'.format(n) for k, n in stats.items()}
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]

    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])

    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
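# Minimal usage sketch, assuming `_freeze` is called once per table of the
# usual SQLAlchemy declarative metadata; `Base` and `dump_dir` are assumptions
# for illustration, not names taken from this snippet.
for table in Base.metadata.sorted_tables:
    _freeze(table, dump_dir.joinpath('%s.csv' % table.name))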
def prime(args): """If data needs to be denormalized for lookup, do that here. This procedure should be separate from the db initialization, because it will have to be run periodically whenever data has been updated. """ # # Now that we loaded all languoids and refs, we can compute the MED values. # meds = defaultdict(list) for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\ select l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages from languagesource as ls, language as l, source as s, ref as r where ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk order by l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk """): meds[lpk].append((spk, sid, sname, med_type, year, pages)) # The last one is the overall MED # Now weed out the "newer but worse" sources: for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items(): relevant, lastyear = [], 10000 for spk, sid, sname, med_type, year, pages in sources: if year and year < lastyear: # If year is more recent, this is a "newer but worse" item relevant.append((spk, sid, sname, med_type, year, pages)) lastyear = year meds[lpk] = relevant med_param = common.Parameter.get('med') med_domain = {de.id: de for de in med_param.domain} contrib = common.Contribution.get('glottolog') for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))): l.update_jsondata(meds=[ (sid, med_type, year, pages, sname) for spk, sid, sname, med_type, year, pages in meds[l.pk]]) if not meds[l.pk]: continue med = meds[l.pk][0] # Record the overall MED as value for the 'med' Parameter: vs = common.ValueSet( id=idjoin('med', l.id), contribution=contrib, parameter=med_param, language=l, ) DBSession.add(common.Value( id=idjoin('med', l.id), name=getattr(args.repos.med_types, med[3]).name, domainelement=med_domain[idjoin('med', med[3])], valueset=vs, )) DBSession.flush() DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk)) recreate_treeclosure() macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\ select de.pk, de.id, de.name from domainelement as de, parameter as p where de.parameter_pk = p.pk and p.id = 'macroarea' """)} for lid, lpk, cpk, ppk, mas in DBSession.execute("""\ select l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk) from language as l, treeclosuretable as t, parameter as p, valueset as vs, value as v where l.pk = t.parent_pk and t.child_pk = vs.language_pk and vs.parameter_pk = p.pk and p.id = 'macroarea' and v.valueset_pk = vs.pk and l.pk not in ( select language_pk from valueset as _vs, parameter as _p where _vs.parameter_pk = _p.pk and _p.id = 'macroarea' ) group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""): for i, mapk in enumerate(mas): if i == 0: vs = common.ValueSet( id=idjoin('macroarea', lid), language_pk=lpk, parameter_pk=ppk, contribution_pk=cpk) DBSession.add(common.Value( id=idjoin(macroareas[mapk][0], lid), name=macroareas[mapk][1], domainelement_pk=mapk, valueset=vs)) for vs in DBSession.query(common.ValueSet)\ .join(common.Language)\ .join(common.Parameter)\ .filter(common.Parameter.id == 'macroarea')\ .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)): vs.language.macroareas = ', '.join([macroareas[v.domainelement_pk][1] for v in vs.values]) for row in list(DBSession.execute( "select pk, pages, pages_int, startpage_int from source where pages_int < 0" )): raise ValueError(row) version = assert_release(args.repos.repos) 
with jsonlib.update(gc2version(args), indent=4) as legacy: for lang in DBSession.query(common.Language): if lang.id not in legacy: lang.update_jsondata(new=True) legacy[lang.id] = version valuesets = { r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)} refs = { r[0]: r[1] for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)} for vsid, vspk in valuesets.items(): if vsid.startswith('macroarea-'): DBSession.add(common.ValueSetReference( source_pk=refs[args.repos.macroareas.__defaults__['reference_id']], valueset_pk=vspk)) for vs in DBSession.query(common.ValueSet)\ .join(common.Parameter)\ .filter(common.Parameter.id == 'aes'): if vs.jsondata['reference_id']: DBSession.add(common.ValueSetReference( source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk)) for lang in args.repos.languoids(): if lang.category == args.repos.language_types.bookkeeping.category: continue clf = lang.classification_comment if clf: for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]: if getattr(clf, attr_ + 'refs'): if split_items(lang.cfg['classification'][attr_ + 'refs']) != \ split_items(lang.cfg['classification'].get(attr_)): vspk = valuesets['{0}-{1}'.format(pid, lang.id)] for ref in getattr(clf, attr_ + 'refs'): spk = refs.get(ref.key) if spk: DBSession.add( common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
def load(args): fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") dataset = common.Dataset( id='glottolog', name="Glottolog {0}".format(args.args[0]), publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) data = Data() for i, (id_, name) in enumerate([ ('hammarstroem', 'Harald Hammarström'), ('bank', 'Sebastian Bank'), ('forkel', 'Robert Forkel'), ('haspelmath', 'Martin Haspelmath'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) clf = data.add(common.Contribution, 'clf', id='clf', name='Classification') DBSession.add(common.ContributionContributor( contribution=clf, contributor=data['Contributor']['hammarstroem'])) for pid, pname in [ ('fc', 'Family classification'), ('sc', 'Subclassification'), ('vitality', 'Degree of endangerment'), ]: data.add(common.Parameter, pid, id=pid, name=pname) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) glottolog = args.repos for ma in Macroarea: data.add( models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description) for country in glottolog.countries: data.add(models.Country, country.id, id=country.id, name=country.name) lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list) languoids = list(glottolog.languoids()) nodemap = {l.id: l for l in languoids} for lang in languoids: for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(data, lang, nodemap) mas[lang.id] = [ma.name for ma in lang.macroareas] countries[lang.id] = [c.id for c in lang.countries] lgcodes[lang.id] = lang.id if lang.hid: lgcodes[lang.hid] = lang.id if lang.iso: lgcodes[lang.iso] = lang.id for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for lid, maids in mas.items(): for ma in maids: DBSession.add(models.Languoidmacroarea( languoid_pk=data['Languoid'][lid].pk, macroarea_pk=data['Macroarea'][ma].pk)) for lid, cids in countries.items(): for cid in cids: DBSession.add(models.Languoidcountry( languoid_pk=data['Languoid'][lid].pk, country_pk=data['Country'][cid].pk)) for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: for ma in split_text(entry.fields['macro_area'], separators=',;', 
strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma) DBSession.add(models.Refmacroarea( ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
def create(args): args.log.info('starting migration ...') data = Data() db = create_engine('postgresql://robert@/glottolog2') with transaction.manager: sn = data.add(common.Contributor, 'sn', id='sn', name='Sebastian Nordhoff') hh = data.add(common.Contributor, 'hh', id='hh', name='Harald Hammarström') rf = data.add(common.Contributor, 'rf', id='rf', name='Robert Forkel', url="https://github.com/xrotwang") mh = data.add(common.Contributor, 'mh', id='mh', name='Martin Haspelmath') contrib = data.add(common.Contribution, 'c', id='classification', name='Classification') data.add(common.ContributionContributor, 'hh', contribution=contrib, contributor=hh) params = dict( fc=data.add(common.Parameter, 'fc', id='fc', name='Family classification'), sc=data.add(common.Parameter, 'sc', id='sc', name='Subclassification'), ) dataset = data.add( common.Dataset, 'd', id='glottolog', name='Glottolog 2.0', description='', published=datetime.date(2013, 8, 15), domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, ed in enumerate([sn, hh, rf, mh]): DBSession.add( common.Editor(dataset=dataset, contributor=ed, ord=i + 1)) valuesets = {} def create_languoid(row, father_pk=None): glottocode = { 'akun1242': 'akun1241' }.get(row['alnumcode'], row['alnumcode']) attrs = dict( pk=row['id'], id=glottocode, name=row['primaryname'], description=row['globalclassificationcomment'], level=getattr(models2.LanguoidLevel, row['level']), status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None), father_pk=father_pk, created=row['updated'], jsondata={} if not row['hname'] else {'hname': row['hname']}, ) for attr in [ 'active', 'updated', 'hid', 'latitude', 'longitude', ]: attrs[attr] = row[attr] l = data.add(models2.Languoid, row['id'], **attrs) for type_ in params: id_ = '%s%s' % (type_, row['id']) vs = data.add(common.ValueSet, id_, id=id_, description=row['classificationcomment'] if type_ == 'fc' else row['subclassificationcomment'], language=l, parameter=params[type_], contribution=contrib) data.add(common.Value, id_, id=id_, name='%s - %s' % (row['level'], row['status']), valueset=vs) DBSession.flush() valuesets[id_] = vs.pk return str(row['id']) level = 0 parents = [ create_languoid(row) for row in db.execute( 'select * from languoidbase where father_id is null') ] while parents: args.log.info('level: %s' % level) level += 1 parents = [ create_languoid( row, father_pk=data['Languoid'][row['father_id']].pk) for row in db.execute( 'select * from languoidbase where father_id in (%s)' % ','.join(parents)) ] def handler(offset, batch): svalues = [] rvalues = [] for row in batch: jsondata = json.loads(row['jsondata'] or "{}") jsondata['bibtexkey'] = row['bibtexkey'] dicts = { 's': dict(pk=row['id'], polymorphic_type='base', id=str(row['id']), name='%(author)s %(year)s' % row, description=row['title'], bibtex_type=getattr(EntryType, row['type']), jsondata=jsondata), 'r': dict(pk=row['id']), } for model, map_ in { 's': { 'author': None, 'yearstring': 'year', 'year': 'year_int', 'startpage': 'startpage_int', 'numberofpages': 'pages_int', 'pages': None, 'edition': None, 'school': None, 'address': None, 'url': None, 'note': None, 'number': None, 'series': None, 'editor': None, 'booktitle': None, 'journal': None, 'volume': None, 'publisher': None, }, 'r': { 'endpage': 'endpage_int', 'inlg': None, 'inlg_code': None, 'subject': None, 
'subject_headings': None, 'keywords': None, 'normalizedauthorstring': None, 'normalizededitorstring': None, 'ozbib_id': None, } }.items(): for okey, nkey in map_.items(): dicts[model][nkey or okey] = row[okey] svalues.append(dicts['s']) rvalues.append(dicts['r']) DBSession.execute(common.Source.__table__.insert(), svalues) DBSession.execute(models2.Ref.__table__.insert(), rvalues) select(db, 'select * from refbase order by id', handler) DBSession.execute('COMMIT') for table, model, value, order in [ ('macroarea', models2.Macroarea, lambda i, row: dict(pk=row['id'], id=slug(row['name']), name=row['name'], description=row['description']), None), ('country', models2.Country, lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']), None), ('provider', models2.Provider, lambda i, row: dict(pk=row['id'], id=slug(row['name']), name=row['description'], description=row['comment'], abbr=row['abbr'], url=row['url'], refurl=row['refurl'], bibfield=row['bibfield']), None), ('doctype', models2.Doctype, lambda i, row: dict(pk=row['id'], id=slug(row['name']), abbr=row['abbr'], name=row['name'], description=row['description']), None), ('refprovider', models2.Refprovider, lambda i, row: dict( pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']), ('provider_id', 'refbase_id')), ('refdoctype', models2.Refdoctype, lambda i, row: dict( pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']), ('doctype_id', 'refbase_id')), ]: insert(db, table, model, value, order=order) names = dict( (int(d['id']), d['pk']) for d in insert(db, 'namebase', common.Identifier, lambda i, row: dict(pk=i, id=str(row['id']), name=row['namestring'], type='name', description=row['nameprovider'], lang=row['inlg'] if row['inlg'] and len(row['inlg']) <= 3 else 'en'), order='id')) codes = dict( (int(d['id']), d['pk']) for d in insert(db, 'codebase', common.Identifier, lambda i, row: dict(pk=i, id=str(row['id']), name=row['codestring'], type=common.IdentifierType.iso. value if row['codeprovider'] == 'ISO' else row['codeprovider']), start=len(names), order='id')) res = insert( db, 'nodecodes', common.LanguageIdentifier, lambda i, row: dict(pk=i, language_pk=row['languoidbase_id'], identifier_pk=codes[row['codebase_id']])) insert(db, 'nodenames', common.LanguageIdentifier, lambda i, row: dict(pk=i, language_pk=row['languoidbase_id'], identifier_pk=names[row['namebase_id']]), start=len(res)) for table, model, value in [ ('languoidmacroarea', models2.Languoidmacroarea, lambda i, row: dict(pk=i, languoid_pk=row['languoidbase_id'], macroarea_pk=row['macroarea_id'])), ('languoidcountry', models2.Languoidcountry, lambda i, row: dict(pk=i, languoid_pk=row['languoidbase_id'], country_pk=row['country_id'])), ('noderefs', common.LanguageSource, lambda i, row: dict(pk=i, language_pk=row['languoidbase_id'], source_pk=row['refbase_id'])), ('refmacroarea', models2.Refmacroarea, lambda i, row: dict( pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])), ('refcountry', models2.Refcountry, lambda i, row: dict( pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])), ('spuriousreplacements', models2.Superseded, lambda i, row: dict(pk=i, languoid_pk=row['languoidbase_id'], replacement_pk=row['replacement_id'], description=row['relation'])), ('justification', common.ValueSetReference, lambda i, row: dict( pk=i, valueset_pk=valuesets['%s%s' % ('fc' if row[ 'type'] == 'family' else 'sc', row['languoidbase_id'])], source_pk=row['refbase_id'], description=row['pages'])), ]: insert(db, table, model, value)
def main(args): fts.index('fts_index', Word.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") data = Data() dataset = common.Dataset( id=dictionaria.__name__, name="Dictionaria", description="The Dictionary Journal", published=date(2017, 3, 30), contact='*****@*****.**', domain='dictionaria.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) for i, (id_, name) in enumerate([ ('haspelmathmartin', 'Martin Haspelmath'), ('moselulrike', 'Ulrike Mosel'), ('stiebelsbarbara', 'Barbara Stiebels') ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) comparison_meanings = {} print('loading concepts ...') glosses = set() concepticon = Concepticon( REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data')) if not args.no_concepts: for conceptset in concepticon.conceptsets.values(): if conceptset.gloss in glosses: continue glosses.add(conceptset.gloss) cm = data.add( ComparisonMeaning, conceptset.id, id=conceptset.id, name=conceptset.gloss.lower(), description=conceptset.definition, concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id) comparison_meanings[cm.id] = cm DBSession.flush() print('... done') comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()} submissions = [] for submission in REPOS.joinpath( 'submissions-internal' if args.internal else 'submissions').glob('*'): if not submission.is_dir(): continue try: submission = Submission(submission) except ValueError: continue md = submission.md if md is None: continue if not md['date_published']: continue id_ = submission.id if args.dict and args.dict != id_ and args.dict != 'all': continue lmd = md['language'] props = md.get('properties', {}) props.setdefault('custom_fields', []) props['metalanguage_styles'] = {} for v, s in zip(props.get('metalanguages', {}).values(), ['success', 'info', 'warning', 'important']): props['metalanguage_styles'][v] = s props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f for f in props['custom_fields']] language = data['Variety'].get(lmd['glottocode']) if not language: language = data.add( Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name']) md['date_published'] = md['date_published'] or date.today().isoformat() if '-' not in md['date_published']: md['date_published'] = md['date_published'] + '-01-01' dictionary = data.add( Dictionary, id_, id=id_, number=md.get('number'), name=props.get('title', lmd['name'] + ' dictionary'), description=submission.description, language=language, published=date(*map(int, md['date_published'].split('-'))), jsondata=props) for i, spec in enumerate(md['authors']): if not isinstance(spec, dict): cname, address = spec, None spec = {} else: cname, address = spec['name'], spec.get('affiliation') name = HumanName(cname) cid = slug('%s%s' % (name.last, name.first)) contrib = data['Contributor'].get(cid) if not contrib: contrib = data.add( common.Contributor, cid, id=cid, name=cname, address=address, url=spec.get('url'), email=spec.get('email')) DBSession.add(common.ContributionContributor( ord=i + 1, primary=True, 
contributor=contrib, contribution=dictionary)) submissions.append((dictionary.id, language.id, submission)) transaction.commit() for did, lid, submission in submissions: #if submission.id != 'sidaama': # continue transaction.begin() print('loading %s ...' % submission.id) dictdata = Data() lang = Variety.get(lid) submission.load_examples(Dictionary.get(did), dictdata, lang) submission.dictionary.load( submission, dictdata, Dictionary.get(did), lang, comparison_meanings, OrderedDict(submission.md.get('properties', {}).get('labels', []))) transaction.commit() print('... done') transaction.begin() load_families( Data(), [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)], glottolog_repos='../../glottolog3/glottolog')
def prime_cache(args): """If data needs to be denormalized for lookup, do that here. This procedure should be separate from the db initialization, because it will have to be run periodically whenever data has been updated. """ compute_language_sources() return from time import time _s = time() def checkpoint(s, msg=None): n = time() print(n - s, msg or '') return n sql = """ select p.id, l.id, v.name from value as v, valueset as vs, language as l, parameter as p where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk """ datatriples = [(v[0], v[1], v[2]) for v in DBSession.execute(sql)] _s = checkpoint(_s, '%s values loaded' % len(datatriples)) flv = dict([(feature, dict(lvs)) for (feature, lvs) in grp(datatriples).items()]) _s = checkpoint(_s, 'triples grouped') clfps = list(get_clf_paths([row[0] for row in DBSession.execute("select id from language")])) _s = checkpoint(_s, '%s clfps loaded' % len(clfps)) features = {f.id: f for f in DBSession.query(Feature)} for (f, lv) in flv.items(): features[f].representation = len(lv) DBSession.flush() _s = checkpoint(_s, 'representation assigned') families = {f.id: f for f in DBSession.query(Family)} if False: fs = feature_stability(datatriples, clfps) _s = checkpoint(_s, 'feature_stability computed') for (f, (s, transitions, stationarity_p, synchronic_p)) in fs: print(f) stability = Stability( id=f.replace("GB", "S"), feature=features[f], parsimony_stability_value=s["stability"], parsimony_retentions=s["retentions"], parsimony_transitions=s["transitions"], jsondata={'diachronic_p': stationarity_p, "synchronic_p": synchronic_p}) DBSession.add(stability) for (i, (fam, (fromnode, tonode), (ft, tt))) in enumerate(transitions): DBSession.add(Transition( id="%s: %s->%s" % (f, fromnode, tonode), stability=stability, fromnode=get_name(fromnode), tonode=get_name(tonode), fromvalue=ft, tovalue=tt, family=families[fam], retention_innovation="Retention" if ft == tt else "Innovation")) DBSession.flush() _s = checkpoint(_s, 'stability and transitions loaded') imps = feature_dependencies(datatriples) _s = checkpoint(_s, 'feature_dependencies computed') if True: (H, V) = dependencies_graph([(v, f1, f2) for ((v, dstats), f1, f2) in imps]) _s = checkpoint(_s, 'dependencies_graph written') for (i, ((v, dstats), f1, f2)) in enumerate(imps): combinatory_status = ("primary" if (f1, f2) in H else ("epiphenomenal" if v > 0.0 else None)) if H else "N/A" DBSession.add(Dependency( id="%s->%s" % (f1, f2), strength=v, feature1=features[f1], feature2=features[f2], representation=dstats["representation"], combinatory_status=combinatory_status, jsondata=dstats)) DBSession.flush() _s = checkpoint(_s, 'dependencies loaded') coordinates = { lg.id: (lg.longitude, lg.latitude) for lg in DBSession.query(common.Language) .filter(common.Language.longitude != None) .filter(common.Language.latitude != None)} deepfams = deep_families(datatriples, clfps, coordinates=coordinates) _s = checkpoint(_s, '%s deep_families computed' % len(deepfams)) missing_families = set() data = Data() for ((l1, l2), support_value, significance, supports, f1c, f2c) in deepfams: dname = "proto-%s x proto-%s" % (glottolog_names[l1], glottolog_names[l2]) kmdistance = havdist(f1c, f2c) (f1lon, f1lat) = f1c if f1c else (None, None) (f2lon, f2lat) = f2c if f2c else (None, None) for li in [l1, l2]: if li not in families: missing_families.add(li) deepfam = DeepFamily( id=dname, support_value=support_value, significance=significance, family1=families.get(l1), family2=families.get(l2), 
family1_latitude = f1lat, family1_longitude = f1lon, family2_latitude = f2lat, family2_longitude = f2lon, geographic_plausibility = kmdistance) DBSession.add(deepfam) for (f, v1, v2, historical_score, independent_score, support_score) in supports: vid = ("%s: %s %s %s" % (f, v1, "==" if v1 == v2 else "!=", v2)).replace(".", "") #vname = ("%s|%s" % (v1, v2)).replace(".", "") #print vid, vname if vid not in data["Support"]: data.add( Support, vid, id = vid, historical_score = historical_score, independent_score = independent_score, support_score = support_score, value1= v1, value2 = v2, feature=features[f]) DBSession.add(HasSupport( id=dname + "-" + vid, deepfamily = deepfam, support = data["Support"][vid])) print('missing_families:') print(missing_families) DBSession.flush() _s = checkpoint(_s, 'deep_families loaded') compute_language_sources()
def main(args): active_only = not args.all coords = dict( (r[0], r[1:]) for r in dsv.rows(args.data_file('coordinates.tab'))) codes = dict((row[0], row[1]) for row in DBSession.execute( "select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null" )) maxid = DBSession.execute( "select pk from languoid order by pk desc limit 1").fetchone()[0] gcs = {} lnames = {} for row in DBSession.execute("select pk, name from language"): lnames[row[0]] = row[1] # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages families = OrderedDict() # dict mapping identifiers of H-languages to branches languages = OrderedDict() parse_families(args.data_file('lff.txt'), families, languages) # handle isolates / collapse families with exactly one leaf: isolate_names = {} collapsed_names = {} for key in families.keys(): if len(families[key]) == 1: if len(key) == 1: # isolate languages[families[key].keys()[0]][0] = None isolate_names[key[0]] = families[key].keys()[ 0] # map name to code else: languages[families[key].keys()[0]][0] = key[:-1] collapsed_names[key[-1]] = families[key].keys()[0] del families[key] # we also want to be able to lookup families by name names = {} for branch in families: name = branch[-1] if name in names: names[name].append(branch) else: names[name] = [branch] # now add the unclassifiabble, unattested, un-whatever parse_families(args.data_file('lof.txt'), families, languages) ncodes = {} languoids = [] for code in languages: if code not in codes: maxid += 1 ncodes[code] = maxid hnode, status, name, comment = languages[code] # we have to insert a new H-language! attrs = languoid( maxid, 'language', hid=code, id=glottocode(unicode(name), DBSession, gcs), name=name, hname=name, status=status, globalclassificationcomment=comment or None, ) print '++', attrs if coords.get(code): attrs['longitude'], attrs['latitude'] = map( float, coords.get(code)) languoids.append(attrs) urnodes = {} rnodes = {} for family in families: leafs = families[family] assert family[0] not in ['Speech Register', 'Spurious'] leafs = tuple( sorted(code for code in families[family].keys() if code in codes)) assert leafs if leafs in rnodes: # special case: there may be additional "Unclassified something" nodes in # branch without any changes in the set of leafs. assert [n for n in family if n.startswith('Unclassified')] fset, rset = set(family), set(rnodes[leafs]) assert rset.issubset(fset) assert leafs not in urnodes urnodes[leafs] = family #if len(family) > rnodes[leafs]: # rnodes[leafs] = family else: rnodes[leafs] = family # # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in # the family tree. # # for set comparisons we compute a list of actual sets of leafs as well leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))] todo = [] # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs glnodes = {} # # note: all languoids with level null have children, thus are not dialects! 
# sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null" if active_only: sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true" for row in DBSession.execute(sql).fetchall(): leafs = [ r[0] for r in DBSession.execute( "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'" % row[0]) ] if leafs: glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs)) else: # families without leafs will be marked as retired if row[1] in names and len(names[row[1]]) == 1: # unique family name, good enough for a match!? todo.append(Migration(row[0], None, pointer=names[row[1]][0])) else: todo.append(Migration(row[0], None)) # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes! rglnodes = {} for node, leafs in glnodes.items(): if leafs in rglnodes: rglnodes[leafs].append(node) else: rglnodes[leafs] = [node] # now we look for matches between old and new classification: for leafs, nodes in rglnodes.items(): assert leafs assert nodes todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names)) # compile a mapping for exact matches: branch_to_pk = {} for m in todo: if m.hid: assert m.hid not in branch_to_pk branch_to_pk[m.hid] = m.pk new = 0 for hnode in sorted(families.keys(), key=lambda b: (len(b), b)): # loop through branches breadth first to determine what's to be inserted if hnode not in branch_to_pk: t = tuple(sorted(families[hnode].keys())) if t in rglnodes: # the "Unclassified subfamily" special case from above: assert [n for n in hnode if n.startswith('Unclassified')] # make sure, the existing glottolog family for the set of leafs is mapped # to some other node in the new classification: assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid] maxid += 1 attrs = languoid( maxid, 'family', id=glottocode(unicode(hnode[-1]), DBSession, gcs), name=hnode[-1], hname=hnode[-1], ) branch_to_pk[hnode] = maxid lnames[maxid] = hnode[-1] if len(hnode) > 1: attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])] assert attrs['father_pk'] print '++', attrs new += 1 languoids.append(attrs) # now on to the updates for families: matches, migrations, nomatches = 0, 0, 0 for m in todo: attrs = languoid(m.pk, 'family', name=lnames[m.pk]) if m.hid: #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8') matches += 1 if len(m.hid) > 1: attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])] if getattr(m, 'rename', False): attrs['name'] = m.hid[-1] attrs['hname'] = m.hid[-1] else: attrs['active'] = False if getattr(m, 'pointer', False): print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join( m.pointer).encode('utf8') migrations += 1 attrs['replacement'] = branch_to_pk[m.pointer] else: print '--', lnames[m.pk].encode('utf8'), '->' nomatches += 1 languoids.append(attrs) print matches, 'matches' print migrations, 'migrations' print nomatches, 'nomatches' print new, 'new nodes' risolate_names = dict(zip(isolate_names.values(), isolate_names.keys())) rcollapsed_names = dict( zip(collapsed_names.values(), collapsed_names.keys())) # and updates of father_pks for languages: for l in languages: hnode, status, name, comment = languages[l] id_ = codes.get(l, ncodes.get(l)) attrs = languoid(id_, 'language', status=status) if hnode: attrs['father_pk'] = branch_to_pk[hnode] 
attrs['globalclassificationcomment'] = comment or None # look for hnames! if l in risolate_names: attrs['hname'] = risolate_names[l] if l in rcollapsed_names: attrs['hname'] = rcollapsed_names[l] languoids.append(attrs) for row in DBSession.execute( "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'" ).fetchall(): if row[1] not in languages: # languoids with Harald's private code that are no longer in use attrs = languoid(row[0], 'language', status='retired', active=False, father_pk=None) languoids.append(attrs) with open(args.data_file('languoids.json'), 'w') as fp: json.dump(languoids, fp)
def main(args): # pragma: no cover stats = Counter(new=0, updated=0, skipped=0) changes = {} with transaction.manager: update_providers(args) DBSession.flush() provider_map = get_map(Provider) macroarea_map = get_map(Macroarea) doctype_map = get_map(Doctype) languoid_map = {} for l in DBSession.query(Languoid): if l.hid: languoid_map[l.hid] = l languoid_map[l.id] = l for i, rec in enumerate(get_bib(args)): if i and i % 1000 == 0: print i, 'records done', stats['updated'] + stats['new'], 'changed' if len(rec.keys()) < 6: # not enough information! stats.update(['skipped']) continue changed = False assert rec.get('glottolog_ref_id') id_ = int(rec.get('glottolog_ref_id')) ref = DBSession.query(Source).get(id_) update = True if ref else False kw = { 'pk': id_, 'bibtex_type': rec.genre, 'id': str(id_), 'jsondata': {'bibtexkey': rec.id}, } for source, target in FIELD_MAP.items(): if target is None: continue value = rec.get(source) if value: value = unescape(value) if target: kw[target] = CONVERTER.get(source, lambda x: x)(value) else: kw['jsondata'][source] = value if kw['jsondata'].get('hhtype'): trigger = ca_trigger(kw['jsondata']['hhtype']) if trigger: kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger # try to extract numeric year, startpage, endpage, numberofpages, ... if kw.get('year'): # prefer years in brackets over the first 4-digit number. match = PREF_YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) else: match = YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) if kw.get('publisher'): p = kw.get('publisher') if ':' in p: address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)] if 'address' not in kw or kw['address'] == address: kw['address'], kw['publisher'] = address, publisher if rec.get('numberofpages'): try: kw['pages_int'] = int(rec.get('numberofpages').strip()) except ValueError: pass if kw.get('pages'): start, end, number = compute_pages(kw['pages']) if start is not None: kw['startpage_int'] = start if end is not None: kw['endpage_int'] = end if number is not None and 'pages_int' not in kw: kw['pages_int'] = number for k in kw.keys(): v = kw[k] if isinstance(v, basestring): v = v.strip() or None kw[k] = v if update: for k in kw.keys(): if k == 'pk': continue v = getattr(ref, k) if kw[k] != v: if k == 'jsondata': d = {k: v for k, v in ref.jsondata.items() if k in NONREF_JSONDATA} d.update(**kw[k]) ref.jsondata = d else: #print k, '--', v #print k, '++', kw[k] setattr(ref, k, kw[k]) changed = True if ref.id in changes: changes[ref.id][k] = ('%s' % v, '%s' % kw[k]) else: changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])} else: changed = True ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw) ref.description = ref.title or ref.booktitle originator = ref.author or ref.editor or 'Anonymous' ref.name = '%s %s' % (originator, ref.year or 'n.d.') a, r = update_relationship( ref.macroareas, [macroarea_map[name] for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))]) changed = changed or a or r src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')] prv = {provider_map[slug(s)] for s in src if s} if set(ref.providers) != prv: ref.providers = list(prv) changed = True a, r = update_relationship( ref.doctypes, [doctype_map[m.group('name')] for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))]) changed = changed or a or r if not update: stats.update(['new']) DBSession.add(ref) elif changed: 
                stats.update(['updated'])

        args.log.info('%s' % stats)

        DBSession.execute(
            "update source set description = title where description is null and title is not null;")
        DBSession.execute(
            "update source set description = booktitle where description is null and booktitle is not null;")

        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s"
                    % (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
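For readers unfamiliar with the year handling above, here is a small self-contained sketch of the "prefer bracketed years" logic. The two regexes are hypothetical stand-ins for the imported PREF_YEAR_PATTERN and YEAR_PATTERN, which may differ in the real module; extract_year_int is an illustrative helper, not part of the script.

import re

# Hypothetical stand-ins for the imported PREF_YEAR_PATTERN / YEAR_PATTERN.
PREF_YEAR_PATTERN = re.compile(r'\[(?P<year>[12][0-9]{3})\]')
YEAR_PATTERN = re.compile(r'(?P<year>[12][0-9]{3})')

def extract_year_int(year_field):
    # Prefer a year in square brackets, fall back to the first 4-digit number.
    for pattern in (PREF_YEAR_PATTERN, YEAR_PATTERN):
        match = pattern.search(year_field)
        if match:
            return int(match.group('year'))
    return None

assert extract_year_int('1999 [2001]') == 2001
assert extract_year_int('ca. 1884') == 1884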
def main(args): active_only = not args.all codes = dict((row[0], row[1]) for row in DBSession.execute("select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null")) maxid = DBSession.execute( "select pk from languoid order by pk desc limit 1").fetchone()[0] gcs = {} lnames = {} for row in DBSession.execute("select pk, name from language"): lnames[row[0]] = row[1] # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages families = OrderedDict() # dict mapping identifiers of H-languages to branches languages = OrderedDict() parse_families(data_file(args, 'lff.txt'), families, languages) # handle isolates / collapse families with exactly one leaf: isolate_names = {} collapsed_names = {} for key in families.keys(): if len(families[key]) == 1: if len(key) == 1: # isolate languages[families[key].keys()[0]][0] = None isolate_names[key[0]] = families[key].keys()[0] # map name to code else: languages[families[key].keys()[0]][0] = key[:-1] collapsed_names[key[-1]] = families[key].keys()[0] del families[key] # we also want to be able to lookup families by name names = {} for branch in families: name = branch[-1] if name in names: names[name].append(branch) else: names[name] = [branch] # now add the unclassifiabble, unattested, un-whatever parse_families(data_file(args, 'lof.txt'), families, languages) ncodes = {} languoids = [] for code in languages: if code not in codes: maxid += 1 ncodes[code] = maxid hnode, status, name, comment = languages[code] # we have to insert a new H-language! attrs = languoid( maxid, 'language', hid=code, id=glottocode(unicode(name), DBSession, gcs), name=name, hname=name, status=status, globalclassificationcomment=comment or None, ) print '++', attrs languoids.append(attrs) urnodes = {} rnodes = {} for family in families: #leafs = families[family] assert family[0] not in ['Speech Register', 'Spurious'] leafs = tuple(sorted(code for code in families[family].keys() if code in codes)) try: assert leafs except: print 'Family with only new languages!!' print family continue #raise if leafs in rnodes: # so we have already seen this exact set of leaves. # # special case: there may be additional "Unclassified something" nodes in # branch without any changes in the set of leafs ... try: assert [n for n in family if n.startswith('Unclassified')] except: print family print leafs # ... or the full leafset contains new languages assert [code for code in families[family[:-1]].keys() if code in ncodes] fset, rset = set(family), set(rnodes[leafs]) assert rset.issubset(fset) assert leafs not in urnodes urnodes[leafs] = family #if len(family) > rnodes[leafs]: # rnodes[leafs] = family else: rnodes[leafs] = family # # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in # the family tree. # # for set comparisons we compute a list of actual sets of leafs as well leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))] todo = [] # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs glnodes = {} # # note: all languoids with level null have children, thus are not dialects! 
# sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null" if active_only: sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true" for row in DBSession.execute(sql).fetchall(): leafs = [r[0] for r in DBSession.execute( "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'" % row[0])] if leafs: glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs)) else: # families without leafs will be marked as retired if row[1] in names and len(names[row[1]]) == 1: # unique family name, good enough for a match!? todo.append(Migration(row[0], None, pointer=names[row[1]][0])) else: todo.append(Migration(row[0], None)) # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes! rglnodes = {} for node, leafs in glnodes.items(): if leafs in rglnodes: rglnodes[leafs].append(node) else: rglnodes[leafs] = [node] # now we look for matches between old and new classification: for leafs, nodes in rglnodes.items(): assert leafs assert nodes todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names)) # compile a mapping for exact matches: branch_to_pk = {} for m in todo: if m.hid: if m.hid in branch_to_pk: if branch_to_pk[m.hid] != m.pk: # compare names: if lnames[m.pk] == m.hid[-1]: print '#### type1' branch_to_pk[m.hid] = m.pk elif lnames[branch_to_pk[m.hid]] == m.hid[-1]: print '#### type2' pass else: print m.hid print m.hid[-1] print lnames[m.pk] print branch_to_pk[m.hid] print m.pk raise ValueError else: #assert m.hid not in branch_to_pk branch_to_pk[m.hid] = m.pk new = 0 for hnode in sorted(families.keys(), key=lambda b: (len(b), b)): # loop through branches breadth first to determine what's to be inserted if hnode not in branch_to_pk: t = tuple(sorted(families[hnode].keys())) if t in rglnodes: # the "Unclassified subfamily" special case from above: try: assert [n for n in hnode if n.startswith('Unclassified')] except: # or the "new language inserted higher up" case! 
assert [code for code in families[hnode[:-1]].keys() if code in ncodes] #print hnode #print t #raise # make sure, the existing glottolog family for the set of leafs is mapped # to some other node in the new classification: assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid] maxid += 1 attrs = languoid( maxid, 'family', id=glottocode(unicode(hnode[-1]), DBSession, gcs), name=hnode[-1], hname=hnode[-1], ) branch_to_pk[hnode] = maxid lnames[maxid] = hnode[-1] if len(hnode) > 1: attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])] assert attrs['father_pk'] print '++', attrs new += 1 languoids.append(attrs) # now on to the updates for families: matches, migrations, nomatches = 0, 0, 0 for m in todo: attrs = languoid(m.pk, 'family', name=lnames[m.pk]) if m.hid: #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8') matches += 1 if len(m.hid) > 1: attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])] if getattr(m, 'rename', False): attrs['name'] = m.hid[-1] attrs['hname'] = m.hid[-1] else: attrs['active'] = False if getattr(m, 'pointer', False): print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(m.pointer).encode('utf8') migrations += 1 attrs['replacement'] = branch_to_pk[m.pointer] else: print '--', lnames[m.pk].encode('utf8'), '->' nomatches += 1 languoids.append(attrs) print matches, 'matches' print migrations, 'migrations' print nomatches, 'nomatches' print new, 'new nodes' risolate_names = dict(zip(isolate_names.values(), isolate_names.keys())) rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys())) # and updates of father_pks for languages: for l in languages: hnode, status, name, comment = languages[l] id_ = codes.get(l, ncodes.get(l)) attrs = languoid(id_, 'language', status=status) if hnode: attrs['father_pk'] = branch_to_pk[hnode] attrs['globalclassificationcomment'] = comment or None # look for hnames! if l in risolate_names: attrs['hname'] = risolate_names[l] if l in rcollapsed_names: attrs['hname'] = rcollapsed_names[l] languoids.append(attrs) for row in DBSession.execute( "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'" ).fetchall(): if row[1] not in languages: # languoids with Harald's private code that are no longer in use attrs = languoid( row[0], 'language', status='retired', active=False, father_pk=None) languoids.append(attrs) with open(data_file(args, 'languoids.json'), 'w') as fp: json.dump(languoids, fp)
def main(args): stats = Counter(new=0, matches=0, migrations=0, nomatches=0) l, ll = Language.__table__.alias('l'), Languoid.__table__.alias('ll') gl_languoids = list( DBSession.execute( select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall()) # we collect a list of changes which we will store in a JSON file. changes = [] hid_to_pk = { row['ll_hid']: row['l_pk'] for row in gl_languoids if row['ll_hid'] } max_languoid_pk = max(*[row['l_pk'] for row in gl_languoids]) new_glottocodes = {} pk_to_name = {row['l_pk']: row['l_name'] for row in gl_languoids} # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages hh_families = OrderedDict() # dict mapping identifiers (i.e. hid) of H-languages to branches hh_languages = OrderedDict() parse_families(args.data_dir.joinpath('languoids', 'lff.txt'), hh_families, hh_languages) # handle isolates / collapse families with exactly one leaf: isolate_names = {} collapsed_names = {} for key in hh_families.keys(): if len(hh_families[key]) == 1: if len(key) == 1: # isolate hh_languages[hh_families[key].keys()[0]][0] = None isolate_names[key[0]] = hh_families[key].keys()[ 0] # map name to code else: hh_languages[hh_families[key].keys()[0]][0] = key[:-1] collapsed_names[key[-1]] = hh_families[key].keys()[0] del hh_families[key] # now add the unclassifiabble, unattested, un-whatever parse_families(args.data_dir.joinpath('languoids', 'lof.txt'), hh_families, hh_languages) # we also want to be able to lookup families by name fname_to_branches = defaultdict(list) for branch in hh_families: fname_to_branches[branch[-1]].append(branch) new_hid_to_pk = {} for code, (hnode, status, name) in hh_languages.items(): if code not in hid_to_pk: # we have to insert a new H-language! max_languoid_pk += 1 new_hid_to_pk[code] = max_languoid_pk if name in pk_to_name.values(): args.log.warn('new code {1} for existing name {0}'.format( name, code)) changes.append( languoid(max_languoid_pk, 'language', hid=code, id=glottocode(unicode(name), DBSession, new_glottocodes), name=name, hname=name, status=status)) stats.update(['new_languages']) duplicate_leafset_to_branch = {} leafset_to_branch = {} for family, langs in hh_families.items(): leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk) if not leafs: args.log.info('Family with only new languages: %s, %s' % (family, langs)) continue if leafs in leafset_to_branch: # so we have already seen this exact set of leaves. # # special case: there may be additional "Unclassified something" nodes in # branch without any changes in the set of leafs ... if not [n for n in family if n.startswith('Unclassified')]: # ... or the full leafset contains new languages assert [ hid for hid in hh_families[family[:-1]].keys() if hid in new_hid_to_pk ] fset, rset = set(family), set(leafset_to_branch[leafs]) assert rset.issubset(fset) assert leafs not in duplicate_leafset_to_branch duplicate_leafset_to_branch[leafs] = family else: leafset_to_branch[leafs] = family # # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages # to branches in the new family tree. # # for set comparisons we compute a list of actual sets (not tuples) of leafs # ordered by length. 
leafsets = [ set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s)) ] todo = [] gl_family_to_leafset = {} def select_leafs(pk): l, tc = Languoid.__table__.alias( 'l'), TreeClosureTable.__table__.alias('tc') return [ r['l_hid'] for r in DBSession.execute( select([l, tc], use_labels=True).where( and_(l.c.pk == tc.c.child_pk, l.c.hid != None, l.c.status != LanguoidStatus.provisional, tc.c.parent_pk == pk))) ] for row in gl_languoids: if row['ll_level'] == LanguoidLevel.family and row['l_active']: leafs = get_leafset(select_leafs(row['l_pk'])) assert leafs glnode = GLNode(row['l_pk'], row['l_name'], row['ll_level'].name, row['ll_father_pk'], row['l_jsondata'].get('hname')) gl_family_to_leafset[glnode] = leafs # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes! leafset_to_gl_family = defaultdict(list) for node, leafs in gl_family_to_leafset.items(): leafset_to_gl_family[leafs].append(node) # now we look for matches between old and new classification: for leafs, nodes in leafset_to_gl_family.items(): todo.extend( match_nodes(args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch, leafsets, fname_to_branches)) # compile a mapping for exact matches: branch_to_pk = {} for m in todo: if m.hid: if m.hid in branch_to_pk: if branch_to_pk[m.hid] != m.pk: # compare names: if pk_to_name[m.pk] == m.hid[-1]: args.log.info('#### type1') branch_to_pk[m.hid] = m.pk elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]: args.log.info('#### type2') else: raise ValueError else: branch_to_pk[m.hid] = m.pk for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)): # loop through branches breadth first to determine what's to be inserted if hnode not in branch_to_pk: t = get_leafset(hh_families[hnode].keys()) if t in leafset_to_gl_family: # the "Unclassified subfamily" special case from above: if not [n for n in hnode if n.startswith('Unclassified')]: assert [ hid for hid in hh_families[hnode[:-1]].keys() if hid in new_hid_to_pk ] # make sure, the existing glottolog family for the set of leafs is mapped # to some other node in the new classification: assert leafset_to_gl_family[t][0].pk in [ m.pk for m in todo if m.hid ] max_languoid_pk += 1 branch_to_pk[hnode] = max_languoid_pk pk_to_name[max_languoid_pk] = hnode[-1] attrs = languoid( max_languoid_pk, 'family', id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes), name=hnode[-1], hname=hnode[-1], ) if len(hnode) > 1: attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])] assert attrs['father_pk'] stats.update(['new']) changes.append(attrs) # now on to the updates for families: for m in todo: attrs = languoid(m.pk, 'family', name=pk_to_name[m.pk]) if m.hid: stats.update(['matches']) if len(m.hid) > 1: attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])] if getattr(m, 'rename', False): attrs['name'] = m.hid[-1] attrs['hname'] = m.hid[-1] else: attrs['active'] = False # mark the languoid as obsolete. 
if getattr(m, 'pointer', False): print '~~', m.pk, pk_to_name[m.pk].encode('utf8'), '->', \ ', '.join(m.pointer).encode('utf8') stats.update(['migrations']) attrs['replacement'] = branch_to_pk[m.pointer] else: stats.update(['nomatches']) changes.append(attrs) args.log.info('%s' % stats) risolate_names = dict(zip(isolate_names.values(), isolate_names.keys())) rcollapsed_names = dict( zip(collapsed_names.values(), collapsed_names.keys())) # and updates of father_pks for languages: for l, (hnode, status, name) in hh_languages.items(): id_ = hid_to_pk.get(l) if not id_: id_ = new_hid_to_pk.get(l) attrs = languoid(id_, 'language', status=status) else: attrs = languoid(id_, 'language', status=status) # In case of existing languoids, we don't change the active flag! del attrs['active'] if id_ in pk_to_name and name != pk_to_name[id_]: if slug(pk_to_name[id_]) == slug(name): attrs['name'] = name if hnode: attrs['father_pk'] = branch_to_pk[hnode] # look for hnames! if l in risolate_names: attrs['hname'] = risolate_names[l] if l in rcollapsed_names: attrs['hname'] = rcollapsed_names[l] changes.append(attrs) for row in gl_languoids: hid = row['ll_hid'] if hid and 'NOCODE' in hid and hid not in hh_languages: # languoids with Harald's private code that are no longer in use changes.append( languoid(row['l_pk'], 'language', status='retired', active=False, father_pk=None)) jsondump(changes, args.data_dir.joinpath('languoids', 'changes.json'), indent=4)
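A toy illustration of the single-leaf handling used in both classification updaters above: an isolate loses its pseudo-family entirely, while a single-leaf subfamily is collapsed onto its parent branch and the dropped name is remembered for the hname lookup. collapse_single_leaf and the sample data are hypothetical, for illustration only.

def collapse_single_leaf(branch, leaf_hid, languages, isolate_names, collapsed_names):
    if len(branch) == 1:
        # isolate: the language gets no family at all
        languages[leaf_hid][0] = None
        isolate_names[branch[0]] = leaf_hid
    else:
        # single-leaf subfamily: attach the language to the parent branch
        languages[leaf_hid][0] = branch[:-1]
        collapsed_names[branch[-1]] = leaf_hid

languages = {'xyz1': [('Family A', 'Only branch'), 'established', 'Xyz']}
isolates, collapsed = {}, {}
collapse_single_leaf(('Family A', 'Only branch'), 'xyz1', languages, isolates, collapsed)
assert languages['xyz1'][0] == ('Family A',)
assert collapsed == {'Only branch': 'xyz1'}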
def recreate_treeclosure():
    DBSession.execute('delete from treeclosuretable')
    SQL = TreeClosureTable.__table__.insert()
    ltable = Languoid.__table__

    # we compute the ancestry for each single languoid
    for lid, fid in DBSession.execute('select pk, father_pk from languoid').fetchall():
        depth = 0
        DBSession.execute(SQL, dict(child_pk=lid, parent_pk=lid, depth=depth))
        tlf = None
        # now follow up the line of ancestors
        while fid:
            tlf = fid
            depth += 1
            DBSession.execute(SQL, dict(child_pk=lid, parent_pk=fid, depth=depth))
            fid = DBSession.execute(
                sql.select([ltable.c.father_pk]).where(ltable.c.pk == fid)).fetchone()[0]
        DBSession.execute(
            'UPDATE languoid SET family_pk = :tlf WHERE pk = :lid', locals())

    # we also pre-compute counts of descendants for each languoid:
    for level in ['language', 'dialect', 'family']:
        DBSession.execute("""\
UPDATE languoid SET child_%(level)s_count = (
    SELECT count(*)
    FROM treeclosuretable as t, languoid as l
    WHERE languoid.pk = t.parent_pk
    AND languoid.pk != t.child_pk AND t.child_pk = l.pk
    AND l.level = '%(level)s'
)""" % locals())

    DBSession.execute('COMMIT')
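A minimal in-memory sketch of what recreate_treeclosure computes, using a plain {pk: father_pk} dict instead of the languoid table; tree_closure is a hypothetical helper for illustration only.

def tree_closure(father_of):
    # Returns (child_pk, parent_pk, depth) triples, including the
    # zero-depth self-reference for every node, by walking up the ancestry.
    rows = []
    for pk in father_of:
        depth, ancestor = 0, pk
        rows.append((pk, pk, 0))
        while father_of.get(ancestor) is not None:
            ancestor = father_of[ancestor]
            depth += 1
            rows.append((pk, ancestor, depth))
    return rows

# 3 is a dialect of language 2, which belongs to family 1.
assert (3, 1, 2) in tree_closure({1: None, 2: 1, 3: 2})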
def load(args): glottolog = args.repos fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") version = assert_release(glottolog.repos) dataset = common.Dataset( id='glottolog', name="{0} {1}".format(glottolog.publication.web.name, version), publisher_name=glottolog.publication.publisher.name, publisher_place=glottolog.publication.publisher.place, publisher_url=glottolog.publication.publisher.url, license=glottolog.publication.license.url, domain=purl.URL(glottolog.publication.web.url).domain(), contact=glottolog.publication.web.contact, jsondata={'license_icon': 'cc-by.png', 'license_name': glottolog.publication.license.name}, ) data = Data() for e in glottolog.editors.values(): if e.current: ed = data.add(common.Contributor, e.id, id=e.id, name=e.name) common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord)) DBSession.add(dataset) contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog') DBSession.add(common.ContributionContributor( contribution=contrib, contributor=data['Contributor']['hammarstroem'])) # # Add Parameters: # add = functools.partial(add_parameter, data) add('fc', name='Family classification') add('sc', name='Subclassification') add('aes', args.repos.aes_status.values(), name=args.repos.aes_status.__defaults__['name'], pkw=dict( jsondata=dict( reference_id=args.repos.aes_status.__defaults__['reference_id'], sources=[attr.asdict(v) for v in args.repos.aes_sources.values()], scale=[attr.asdict(v) for v in args.repos.aes_status.values()])), dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)), ) add('med', args.repos.med_types.values(), name='Most Extensive Description', dekw=lambda de: dict( name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)), ) add('macroarea', args.repos.macroareas.values(), pkw=dict( description=args.repos.macroareas.__defaults__['description'], jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])), dekw=lambda de: dict( name=de.name, description=de.description, jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)), ), ) add('ltype', args.repos.language_types.values(), name='Language Type', dekw=lambda de: dict(name=de.category, description=de.description), delookup='category', ) add('country', args.repos.countries, dekw=lambda de: dict(name=de.id, description=de.name), ) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) # # Now load languoid data, keeping track of relations that can only be inserted later. # lgsources = defaultdict(list) # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level # nodes will precede nested nodes. 
This order must be preserved using an `OrderedDict`: nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()]) lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()} for lang in nodemap.values(): for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(glottolog, data, lang, nodemap) for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: mas = [] for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma) mas.append(ma.name) ref.macroareas = ', '.join(mas)
def load(args): glottolog = args.repos fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") version = assert_release(glottolog.repos) dataset = common.Dataset( id='glottolog', name="Glottolog {0}".format(version), publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) data = Data() for i, (id_, name) in enumerate([ ('hammarstroem', 'Harald Hammarström'), ('forkel', 'Robert Forkel'), ('haspelmath', 'Martin Haspelmath'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) clf = data.add(common.Contribution, 'clf', id='clf', name='Classification') DBSession.add(common.ContributionContributor( contribution=clf, contributor=data['Contributor']['hammarstroem'])) for pid, pname in [ ('fc', 'Family classification'), ('sc', 'Subclassification'), ('vitality', 'Degree of endangerment'), ]: data.add(common.Parameter, pid, id=pid, name=pname) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) for ma in Macroarea: data.add( models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description) for country in glottolog.countries: data.add(models.Country, country.id, id=country.id, name=country.name) lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list) languoids = list(glottolog.languoids()) nodemap = {l.id: l for l in languoids} for lang in languoids: for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(data, lang, nodemap) mas[lang.id] = [ma.name for ma in lang.macroareas] countries[lang.id] = [c.id for c in lang.countries] lgcodes[lang.id] = lang.id if lang.hid: lgcodes[lang.hid] = lang.id if lang.iso: lgcodes[lang.iso] = lang.id for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for lid, maids in mas.items(): for ma in maids: DBSession.add(models.Languoidmacroarea( languoid_pk=data['Languoid'][lid].pk, macroarea_pk=data['Macroarea'][ma].pk)) for lid, cids in countries.items(): for cid in cids: DBSession.add(models.Languoidcountry( languoid_pk=data['Languoid'][lid].pk, country_pk=data['Country'][cid].pk)) for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: for ma in split_text(entry.fields['macro_area'], 
separators=',;', strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma) DBSession.add(models.Refmacroarea( ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
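Both load() variants above apply the same legacy-name normalization to the macro_area field of bib entries; here it is restated as a tiny standalone helper (normalize_macroarea is hypothetical).

def normalize_macroarea(name):
    # Map legacy macroarea names to the current Glottolog ones.
    name = 'North America' if name == 'Middle America' else name
    return 'Papunesia' if name == 'Papua' else name

assert normalize_macroarea('Middle America') == 'North America'
assert normalize_macroarea('Papua') == 'Papunesia'
assert normalize_macroarea('Eurasia') == 'Eurasia'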
def prime_cache(args): """If data needs to be denormalized for lookup, do that here. This procedure should be separate from the db initialization, because it will have to be run periodically whenever data has been updated. """ labels = {} for type_, cls in [('source', common.Source), ('unit', common.Unit)]: labels[type_] = defaultdict(dict) for r in DBSession.query(cls.id, cls.name): sid, _, lid = r[0].partition('-') labels[type_][sid][lid] = r[1] link_map = {'entry': 'unit', 'source': 'source'} for d in DBSession.query(Dictionary): if d.description: soup = BeautifulSoup(markdown(d.description), 'html.parser') for a in soup.find_all('a', href=True): if a['href'] in link_map: type_ = link_map[a['href']] if a.string in labels[type_][d.id]: a['href'] = args.env['request'].route_url( type_, id='{0}-{1}'.format(d.id, a.string)) a.string = labels[type_][d.id][a.string] if type_ == 'unit': a['class'] = 'lemma' d.description, d.toc = toc(soup) for meaning in DBSession.query(ComparisonMeaning).options( joinedload_all(common.Parameter.valuesets, common.ValueSet.values) ): meaning.representation = sum([len(vs.values) for vs in meaning.valuesets]) if meaning.representation == 0: meaning.active = False def joined(iterable): return ' / '.join(sorted(nfilter(set(iterable)))) q = DBSession.query(Word)\ .order_by(Word.dictionary_pk, common.Unit.name, common.Unit.pk)\ .options(joinedload(Word.meanings), joinedload(Word.dictionary)) for _, words in groupby(q, lambda u: u.name): words = list(words) for i, word in enumerate(words): word.description = ' / '.join(m.name for m in word.meanings) word.comparison_meanings = joined(m.reverse for m in word.meanings) word.semantic_domain = joined(m.semantic_domain for m in word.meanings) word.number = i + 1 if len(words) > 1 else 0 for suffix in ['1', '2']: alt_t, alt_l = [], [] for m in word.meanings: if getattr(m, 'alt_translation' + suffix): alt_l.append(getattr(m, 'alt_translation_language' + suffix)) alt_t.append(getattr(m, 'alt_translation' + suffix)) if alt_t and len(set(alt_l)) == 1: DBSession.add(common.Unit_data( object_pk=word.pk, key='lang-' + alt_l.pop(), value=join(alt_t))) def count_unit_media_files(contrib, mtype, cls=common.Unit_files): if cls == common.Unit_files: return DBSession.query(common.Unit_files)\ .join(Word, common.Unit_files.object_pk == Word.pk)\ .filter(Word.dictionary_pk == contrib.pk)\ .filter(common.Unit_files.mime_type.ilike(mtype + '/%'))\ .count() + \ DBSession.query(Meaning_files)\ .join(Meaning, Meaning_files.object_pk == Meaning.pk)\ .join(Word, Meaning.word_pk == Word.pk)\ .filter(Word.dictionary_pk == contrib.pk)\ .filter(Meaning_files.mime_type.ilike(mtype + '/%'))\ .count() return DBSession.query(common.Sentence_files)\ .join(common.Sentence, common.Sentence_files.object_pk == common.Sentence.pk)\ .filter(Example.dictionary_pk == contrib.pk)\ .filter(common.Sentence_files.mime_type.ilike(mtype + '/%'))\ .count() for d in DBSession.query(Dictionary).options(joinedload(Dictionary.words)): d.count_words = len(d.words) sds = set(chain(*[w.semantic_domain_list for w in d.words])) d.semantic_domains = join(sorted(sds)) d.count_audio = count_unit_media_files(d, 'audio') d.count_example_audio = count_unit_media_files(d, 'audio', cls=common.Sentence_files) d.count_image = count_unit_media_files(d, 'image') word_pks = [w.pk for w in d.words] choices = {} for col in d.jsondata.get('custom_fields', []): values = [ r[0] for r in DBSession.query(common.Unit_data.value) .filter(common.Unit_data.object_pk.in_(word_pks)) 
.filter(common.Unit_data.key == col) .distinct()] if len(values) < 40: choices[col] = sorted(values) d.update_jsondata(choices=choices) DBSession.execute(""" UPDATE word SET example_count = s.c FROM ( SELECT m.word_pk AS wpk, count(ms.sentence_pk) AS c FROM meaning AS m, meaningsentence AS ms WHERE m.pk = ms.meaning_pk GROUP BY m.word_pk ) AS s WHERE word.pk = s.wpk """)
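The custom-field "choices" heuristic above (a field only gets a fixed choice list when it has fewer than 40 distinct values), restated as a toy helper; field_choices is illustrative only.

def field_choices(distinct_values, threshold=40):
    # Return a sorted choice list for the field, or None when the field has
    # too many distinct values to be useful as a dropdown.
    values = sorted(set(distinct_values))
    return values if len(values) < threshold else None

assert field_choices(['noun', 'verb', 'noun']) == ['noun', 'verb']
assert field_choices(str(i) for i in range(100)) is None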
def prime_cache(cfg):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for meaning in DBSession.query(ComparisonMeaning).options(
        joinedload_all(common.Parameter.valuesets, common.ValueSet.values)
    ):
        meaning.representation = sum([len(vs.values) for vs in meaning.valuesets])
        if meaning.representation == 0:
            meaning.active = False

    q = DBSession.query(Word)\
        .order_by(Word.dictionary_pk, common.Unit.name, common.Unit.pk)\
        .options(joinedload(Word.meanings), joinedload(Word.dictionary))
    for _, words in groupby(q, lambda u: u.name):
        words = list(words)
        for i, word in enumerate(words):
            word.description = ' / '.join(m.name for m in word.meanings)
            word.comparison_meanings = ' / '.join(nfilter(m.reverse for m in word.meanings))
            word.semantic_domain = ' / '.join(nfilter(m.semantic_domain for m in word.meanings))
            word.number = i + 1 if len(words) > 1 else 0

            for suffix in ['1', '2']:
                alt_t, alt_l = [], []
                for m in word.meanings:
                    if getattr(m, 'alt_translation' + suffix):
                        alt_l.append(getattr(m, 'alt_translation_language' + suffix))
                        alt_t.append(getattr(m, 'alt_translation' + suffix))
                if alt_t and len(set(alt_l)) == 1:
                    DBSession.add(common.Unit_data(
                        object_pk=word.pk, key='lang-' + alt_l.pop(), value=join(alt_t)))

    def count_unit_media_files(contrib, mtype):
        return DBSession.query(common.Unit_files)\
            .join(Word, common.Unit_files.object_pk == Word.pk)\
            .filter(Word.dictionary_pk == contrib.pk)\
            .filter(common.Unit_files.mime_type.ilike(mtype + '/%'))\
            .count()

    for d in DBSession.query(Dictionary).options(joinedload(Dictionary.words)):
        d.count_words = len(d.words)
        sds = set(chain(*[w.semantic_domain_list for w in d.words]))
        d.semantic_domains = join(sorted(sds))
        d.count_audio = count_unit_media_files(d, 'audio')
        d.count_image = count_unit_media_files(d, 'image')

        word_pks = [w.pk for w in d.words]
        choices = {}
        for col in d.jsondata.get('custom_fields', []):
            values = [
                r[0] for r in DBSession.query(common.Unit_data.value)
                .filter(common.Unit_data.object_pk.in_(word_pks))
                .filter(common.Unit_data.key == col)
                .distinct()]
            if len(values) < 40:
                choices[col] = sorted(values)
        d.update_jsondata(choices=choices)

    DBSession.execute("""
    UPDATE word
    SET example_count = s.c
    FROM (
        SELECT m.word_pk AS wpk, count(ms.sentence_pk) AS c
        FROM meaning AS m, meaningsentence AS ms
        WHERE m.pk = ms.meaning_pk
        GROUP BY m.word_pk
    ) AS s
    WHERE word.pk = s.wpk
    """)
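The homonym numbering in both prime_cache variants relies on itertools.groupby over a name-sorted query: identically named words are numbered 1..n, while a unique name gets 0. A minimal standalone sketch with toy data:

from itertools import groupby

words = ['kata', 'kata', 'pela', 'tok']  # already sorted by name
numbered = []
for _, group in groupby(words):
    group = list(group)
    for i, w in enumerate(group):
        numbered.append((w, i + 1 if len(group) > 1 else 0))

assert numbered == [('kata', 1), ('kata', 2), ('pela', 0), ('tok', 0)]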
def main(args): fts.index('fts_index', Word.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") if DBSession.bind.dialect.name == 'postgresql': Index('ducet', collkey(common.Unit.name)).create(DBSession.bind) data = Data() dataset = common.Dataset( id=dictionaria.__name__, name="Dictionaria", description="The Dictionary Journal", published=date(2017, 3, 30), contact='*****@*****.**', domain='dictionaria.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) for i, (id_, name) in enumerate([ ('haspelmathmartin', 'Martin Haspelmath'), ('stiebelsbarbara', 'Barbara Stiebels') ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) comparison_meanings = {} print('loading concepts ...') glosses = set() concepticon = Concepticon( REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data')) if not args.no_concepts: for conceptset in concepticon.conceptsets.values(): if conceptset.gloss in glosses: continue glosses.add(conceptset.gloss) cm = data.add( ComparisonMeaning, conceptset.id, id=conceptset.id, name=conceptset.gloss.lower(), description=conceptset.definition, concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id) comparison_meanings[cm.id] = cm DBSession.flush() print('... done') comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()} submissions = [] for submission in REPOS.joinpath( 'submissions-internal' if args.internal else 'submissions').glob('*'): if not submission.is_dir(): continue try: submission = Submission(submission) except ValueError: continue md = submission.md if md is None: print('no md', submission.id) continue if not md['date_published']: print('no date', submission.id) continue id_ = submission.id if args.dict and args.dict != id_ and args.dict != 'all': print('not selected', submission.id) continue lmd = md['language'] props = md.get('properties', {}) props.setdefault('custom_fields', []) props['metalanguage_styles'] = {} for v, s in zip(props.get('metalanguages', {}).values(), ['success', 'info', 'warning', 'important']): props['metalanguage_styles'][v] = s props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f for f in props['custom_fields']] props.setdefault('choices', {}) language = data['Variety'].get(lmd['glottocode']) if not language: language = data.add( Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name']) md['date_published'] = md['date_published'] or date.today().isoformat() if '-' not in md['date_published']: md['date_published'] = md['date_published'] + '-01-01' dictionary = data.add( Dictionary, id_, id=id_, number=md.get('number'), name=props.get('title', lmd['name'] + ' dictionary'), description=submission.description, language=language, published=date(*map(int, md['date_published'].split('-'))), doi=md.get('doi'), jsondata=props) for i, spec in enumerate(md['authors']): if not isinstance(spec, dict): cname, address = spec, None spec = {} else: cname, address = spec['name'], spec.get('affiliation') name = HumanName(cname) cid = slug('%s%s' % (name.last, name.first)) contrib = 
data['Contributor'].get(cid) if not contrib: contrib = data.add( common.Contributor, cid, id=cid, name=cname, address=address, url=spec.get('url'), email=spec.get('email')) DBSession.add(common.ContributionContributor( ord=i + 1, primary=spec.get('primary', True), contributor=contrib, contribution=dictionary)) submissions.append((dictionary.id, language.id, submission)) transaction.commit() for did, lid, submission in submissions: transaction.begin() print('loading %s ...' % submission.id) dictdata = Data() lang = Variety.get(lid) submission.load_sources(Dictionary.get(did), dictdata) submission.load_examples(Dictionary.get(did), dictdata, lang) submission.dictionary.load( submission, dictdata, Dictionary.get(did), lang, comparison_meanings, OrderedDict(submission.md.get('properties', {}).get('labels', []))) transaction.commit() print('... done') transaction.begin() load_families( Data(), [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)], glottolog_repos='../../glottolog/glottolog')
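A small sketch of the publication-date padding used above: a bare year is extended to YYYY-01-01 before being turned into a datetime.date. parse_date_published is a hypothetical helper, not part of the script.

from datetime import date

def parse_date_published(value):
    # Pad a bare year to a full ISO date before parsing.
    if '-' not in value:
        value = value + '-01-01'
    return date(*map(int, value.split('-')))

assert parse_date_published('2017') == date(2017, 1, 1)
assert parse_date_published('2017-03-30') == date(2017, 3, 30)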
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk and t.child_pk = lma.languoid_pk and lma.macroarea_pk = ma.pk
and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
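The items() helper above normalizes reference tokens such as '**hh:42**:10-15,' before comparing classification refs; here is the per-token normalization on its own (normalize_ref_token is a hypothetical name for illustration).

def normalize_ref_token(token):
    # Drop trailing page specs and commas, keeping only the '**key**' part.
    if '**:' in token:
        token = token.split('**:')[0] + '**'
    if token.endswith(','):
        token = token[:-1].strip()
    return token

assert normalize_ref_token('**hh:42**:10-15,') == '**hh:42**'
assert normalize_ref_token('**hh:7**,') == '**hh:7**'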
def main(args): # pragma: no cover bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8') count = 0 skipped = 0 changes = {} with transaction.manager: update_providers(args) DBSession.flush() provider_map = get_map(Provider) macroarea_map = get_map(Macroarea) doctype_map = get_map(Doctype) known_ids = set(r[0] for r in DBSession.query(Ref.pk)) languoid_map = {} for l in DBSession.query(Languoid): if l.hid: languoid_map[l.hid] = l languoid_map[l.id] = l for i, rec in enumerate(bib): if i and i % 1000 == 0: print i, 'records done', count, 'changed' if len(rec.keys()) < 6: # not enough information! skipped += 1 continue changed = False assert rec.get('glottolog_ref_id') id_ = int(rec.get('glottolog_ref_id')) if args.mode != 'update' and id_ in known_ids: continue ref = DBSession.query(Source).get(id_) update = True if ref else False kw = { 'pk': id_, 'bibtex_type': rec.genre, 'id': str(id_), 'jsondata': {'bibtexkey': rec.id}, } for source, target in FIELD_MAP.items(): if target is None: continue value = rec.get(source) if value: value = unescape(value) if target: kw[target] = CONVERTER.get(source, lambda x: x)(value) else: kw['jsondata'][source] = value if kw['jsondata'].get('hhtype'): trigger = ca_trigger(kw['jsondata']['hhtype']) if trigger: kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger # try to extract numeric year, startpage, endpage, numberofpages, ... if kw.get('year'): # prefer years in brackets over the first 4-digit number. match = PREF_YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) else: match = YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) if kw.get('publisher'): p = kw.get('publisher') if ':' in p: address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)] if not 'address' in kw or kw['address'] == address: kw['address'], kw['publisher'] = address, publisher if rec.get('numberofpages'): try: kw['pages_int'] = int(rec.get('numberofpages').strip()) except ValueError: pass if kw.get('pages'): start, end, number = compute_pages(kw['pages']) if start is not None: kw['startpage_int'] = start if end is not None: kw['endpage_int'] = end if number is not None and 'pages_int' not in kw: kw['pages_int'] = number if update: for k in kw.keys(): if k == 'pk': continue v = getattr(ref, k) if kw[k] != v: if k == 'jsondata': d = ref.jsondata or {} d.update(**kw[k]) for s, t in FIELD_MAP.items(): if t is None and s in d: del d[s] ref.jsondata = d else: print k, '--', v print k, '++', kw[k] setattr(ref, k, kw[k]) changed = True if ref.id in changes: changes[ref.id][k] = ('%s' % v, '%s' % kw[k]) else: changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])} else: changed = True ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw) ref.description = ref.title or ref.booktitle ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.') def append(attr, obj): if obj and obj not in attr: attr.append(obj) return True a, r = update_relationship( ref.macroareas, [macroarea_map[name] for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))]) changed = changed or a or r for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])): result = append(ref.providers, provider_map[slug(name)]) changed = changed or result a, r = update_relationship( ref.doctypes, [doctype_map[m.group('name')] for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))]) changed = changed or a or 
r if not update: DBSession.add(ref) if changed: count += 1 ref.doctypes_str = ', '.join(o.id for o in ref.doctypes) ref.providers_str = ', '.join(o.id for o in ref.providers) print count, 'records updated or imported' print skipped, 'records skipped because of lack of information' DBSession.execute("update source set description = title where description is null and title is not null;") DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;") for row in list(DBSession.execute( "select pk, pages, pages_int, startpage_int from source where pages_int < 0")): pk, pages, number, start = row _start, _end, _number = compute_pages(pages) if _number > 0 and _number != number: DBSession.execute( "update source set pages_int = %s, startpage_int = %s where pk = %s", (_number, _start, pk)) DBSession.execute( "update ref set endpage_int = %s where pk = %s", (_end, pk)) return changes
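Both reference importers split a combined "Address: Publisher" field the same way; a tiny sketch of that split (split_publisher is a hypothetical helper).

def split_publisher(value):
    # Split 'Address: Publisher' into its two parts; leave plain publishers alone.
    if ':' in value:
        address, publisher = [s.strip() for s in value.split(':', 1)]
        return address, publisher
    return None, value

assert split_publisher('Berlin: Mouton de Gruyter') == ('Berlin', 'Mouton de Gruyter')
assert split_publisher('SIL International') == (None, 'SIL International')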
def prime_cache(args):
    DBSession.execute('delete from treeclosuretable')
    SQL = models2.TreeClosureTable.__table__.insert()
    ltable = models2.Languoid.__table__

    # we compute the ancestry for each single languoid
    for lid, fid in DBSession.execute(
            'select pk, father_pk from languoid').fetchall():
        depth = 0
        DBSession.execute(SQL, dict(child_pk=lid, parent_pk=lid, depth=depth))
        # now follow up the line of ancestors
        while fid:
            depth += 1
            DBSession.execute(SQL, dict(child_pk=lid, parent_pk=fid, depth=depth))
            fid = DBSession.execute(
                sql.select([ltable.c.father_pk]).where(ltable.c.pk == fid)).fetchone()[0]

    # we also pre-compute counts of descendants for each languoid:
    for level in ['language', 'dialect', 'family']:
        DBSession.execute("""\
UPDATE languoid SET child_%(level)s_count = (
    SELECT count(*)
    FROM treeclosuretable as t, languoid as l
    WHERE languoid.pk = t.parent_pk
    AND languoid.pk != t.child_pk AND t.child_pk = l.pk
    AND l.level = '%(level)s'
)""" % locals())

    DBSession.execute('COMMIT')
def create(args): args.log.info('starting migration ...') data = Data() db = create_engine('postgresql://robert@/glottolog2') with transaction.manager: sn = data.add(common.Contributor, 'sn', id='sn', name='Sebastian Nordhoff') hh = data.add(common.Contributor, 'hh', id='hh', name='Harald Hammarström') rf = data.add(common.Contributor, 'rf', id='rf', name='Robert Forkel', url="https://github.com/xrotwang") mh = data.add(common.Contributor, 'mh', id='mh', name='Martin Haspelmath') contrib = data.add(common.Contribution, 'c', id='classification', name='Classification') data.add(common.ContributionContributor, 'hh', contribution=contrib, contributor=hh) params = dict( fc=data.add(common.Parameter, 'fc', id='fc', name='Family classification'), sc=data.add(common.Parameter, 'sc', id='sc', name='Subclassification'), ) dataset = data.add( common.Dataset, 'd', id='glottolog', name='Glottolog 2.0', description='', published=datetime.date(2013, 8, 15), domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'}) for i, ed in enumerate([sn, hh, rf, mh]): DBSession.add(common.Editor(dataset=dataset, contributor=ed, ord=i + 1)) valuesets = {} def create_languoid(row, father_pk=None): glottocode = {'akun1242': 'akun1241'}.get(row['alnumcode'], row['alnumcode']) attrs = dict( pk=row['id'], id=glottocode, name=row['primaryname'], description=row['globalclassificationcomment'], level=getattr(models2.LanguoidLevel, row['level']), status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None), father_pk=father_pk, created=row['updated'], jsondata={} if not row['hname'] else {'hname': row['hname']}, ) for attr in [ 'active', 'updated', 'hid', 'latitude', 'longitude', ]: attrs[attr] = row[attr] l = data.add(models2.Languoid, row['id'], **attrs) for type_ in params: id_ = '%s%s' % (type_, row['id']) vs = data.add( common.ValueSet, id_, id=id_, description=row['classificationcomment'] if type_ == 'fc' else row['subclassificationcomment'], language=l, parameter=params[type_], contribution=contrib) data.add(common.Value, id_, id=id_, name='%s - %s' % (row['level'], row['status']), valueset=vs) DBSession.flush() valuesets[id_] = vs.pk return str(row['id']) level = 0 parents = [create_languoid(row) for row in db.execute('select * from languoidbase where father_id is null')] while parents: args.log.info('level: %s' % level) level += 1 parents = [ create_languoid(row, father_pk=data['Languoid'][row['father_id']].pk) for row in db.execute( 'select * from languoidbase where father_id in (%s)' % ','.join(parents))] def handler(offset, batch): svalues = [] rvalues = [] for row in batch: jsondata = json.loads(row['jsondata'] or "{}") jsondata['bibtexkey'] = row['bibtexkey'] dicts = { 's': dict( pk=row['id'], polymorphic_type='base', id=str(row['id']), name='%(author)s %(year)s' % row, description=row['title'], bibtex_type=getattr(EntryType, row['type']), jsondata=jsondata), 'r': dict(pk=row['id']), } for model, map_ in { 's': { 'author': None, 'yearstring': 'year', 'year': 'year_int', 'startpage': 'startpage_int', 'numberofpages': 'pages_int', 'pages': None, 'edition': None, 'school': None, 'address': None, 'url': None, 'note': None, 'number': None, 'series': None, 'editor': None, 'booktitle': None, 'journal': None, 'volume': None, 'publisher': None, }, 'r': { 'endpage': 'endpage_int', 'inlg': None, 'inlg_code': None, 'subject': None, 
'subject_headings': None, 'keywords': None, 'normalizedauthorstring': None, 'normalizededitorstring': None, 'ozbib_id': None, } }.items(): for okey, nkey in map_.items(): dicts[model][nkey or okey] = row[okey] svalues.append(dicts['s']) rvalues.append(dicts['r']) DBSession.execute(common.Source.__table__.insert(), svalues) DBSession.execute(models2.Ref.__table__.insert(), rvalues) select(db, 'select * from refbase order by id', handler) DBSession.execute('COMMIT') for table, model, value, order in [ ('macroarea', models2.Macroarea, lambda i, row: dict( pk=row['id'], id=slug(row['name']), name=row['name'], description=row['description']), None), ('country', models2.Country, lambda i, row: dict( pk=row['id'], id=row['alpha2'], name=row['name']), None), ('provider', models2.Provider, lambda i, row: dict( pk=row['id'], id=slug(row['name']), name=row['description'], description=row['comment'], abbr=row['abbr'], url=row['url'], refurl=row['refurl'], bibfield=row['bibfield']), None), ('doctype', models2.Doctype, lambda i, row: dict( pk=row['id'], id=slug(row['name']), abbr=row['abbr'], name=row['name'], description=row['description']), None), ('refprovider', models2.Refprovider, lambda i, row: dict( pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']), ('provider_id', 'refbase_id')), ('refdoctype', models2.Refdoctype, lambda i, row: dict( pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']), ('doctype_id', 'refbase_id')), ]: insert(db, table, model, value, order=order) names = dict( (int(d['id']), d['pk']) for d in insert( db, 'namebase', common.Identifier, lambda i, row: dict( pk=i, id=str(row['id']), name=row['namestring'], type='name', description=row['nameprovider'], lang=row['inlg'] if row['inlg'] and len(row['inlg']) <= 3 else 'en'), order='id')) codes = dict( (int(d['id']), d['pk']) for d in insert( db, 'codebase', common.Identifier, lambda i, row: dict( pk=i, id=str(row['id']), name=row['codestring'], type=common.IdentifierType.iso.value if row['codeprovider'] == 'ISO' else row['codeprovider']), start=len(names), order='id')) res = insert( db, 'nodecodes', common.LanguageIdentifier, lambda i, row: dict( pk=i, language_pk=row['languoidbase_id'], identifier_pk=codes[row['codebase_id']])) insert( db, 'nodenames', common.LanguageIdentifier, lambda i, row: dict( pk=i, language_pk=row['languoidbase_id'], identifier_pk=names[row['namebase_id']]), start=len(res)) for table, model, value in [ ( 'languoidmacroarea', models2.Languoidmacroarea, lambda i, row: dict( pk=i, languoid_pk=row['languoidbase_id'], macroarea_pk=row['macroarea_id'])), ( 'languoidcountry', models2.Languoidcountry, lambda i, row: dict( pk=i, languoid_pk=row['languoidbase_id'], country_pk=row['country_id'])), ( 'noderefs', common.LanguageSource, lambda i, row: dict( pk=i, language_pk=row['languoidbase_id'], source_pk=row['refbase_id'])), ( 'refmacroarea', models2.Refmacroarea, lambda i, row: dict( pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])), ( 'refcountry', models2.Refcountry, lambda i, row: dict( pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])), ( 'spuriousreplacements', models2.Superseded, lambda i, row: dict( pk=i, languoid_pk=row['languoidbase_id'], replacement_pk=row['replacement_id'], description=row['relation'])), ( 'justification', common.ValueSetReference, lambda i, row: dict( pk=i, valueset_pk=valuesets['%s%s' % ( 'fc' if row['type'] == 'family' else 'sc', row['languoidbase_id'])], source_pk=row['refbase_id'], description=row['pages'])), ]: insert(db, table, 
model, value)
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias("l"), Languoid.__table__.alias("ll")
    gl_languoids = list(DBSession.execute(
        select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # We collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {row["ll_hid"]: row["l_pk"] for row in gl_languoids if row["ll_hid"]}
    max_languoid_pk = max(row["l_pk"] for row in gl_languoids)
    new_glottocodes = {}
    pk_to_name = {row["l_pk"]: row["l_name"] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath("languoids", "lff.txt"), hh_families, hh_languages)

    # Handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # Now add the unclassifiable, unattested, un-whatever languoids:
    parse_families(args.data_dir.joinpath("languoids", "lof.txt"), hh_families, hh_languages)

    # We also want to be able to look up families by name:
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # We have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk
            if name in pk_to_name.values():
                args.log.warn("new code {1} for existing name {0}".format(name, code))
            changes.append(languoid(
                max_languoid_pk,
                "language",
                hid=code,
                id=glottocode(unicode(name), DBSession, new_glottocodes),
                name=name,
                hname=name,
                status=status,
            ))
            stats.update(["new_languages"])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info("Family with only new languages: %s, %s" % (family, langs))
            continue

        if leafs in leafset_to_branch:
            # So we have already seen this exact set of leaves.
            #
            # Special case: there may be additional "Unclassified something" nodes in
            # the branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith("Unclassified")]:
                # ... or the full leafset contains new languages.
                assert [hid for hid in hh_families[family[:-1]].keys() if hid in new_hid_to_pk]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    # At this point leafset_to_branch is a consolidated mapping of sets of H-languages
    # to branches in the new family tree.

    # For set comparisons we compute a list of actual sets (not tuples) of leafs,
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))]

    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
        return [
            r["l_hid"] for r in DBSession.execute(
                select([l, tc], use_labels=True).where(and_(
                    l.c.pk == tc.c.child_pk,
                    l.c.hid != None,
                    l.c.status != LanguoidStatus.provisional,
                    tc.c.parent_pk == pk)))
        ]

    for row in gl_languoids:
        if row["ll_level"] == LanguoidLevel.family and row["l_active"]:
            leafs = get_leafset(select_leafs(row["l_pk"]))
            assert leafs
            glnode = GLNode(
                row["l_pk"], row["l_name"], row["ll_level"].name,
                row["ll_father_pk"], row["l_jsondata"].get("hname"))
            gl_family_to_leafset[glnode] = leafs

    # Note: for legacy Glottolog nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # Now we look for matches between the old and the new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(match_nodes(
            args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch,
            leafsets, fname_to_branches))

    # Compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info("#### type1")
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info("#### type2")
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # Loop through branches breadth first to determine what's to be inserted.
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith("Unclassified")]:
                    assert [hid for hid in hh_families[hnode[:-1]].keys() if hid in new_hid_to_pk]
                # Make sure the existing Glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                "family",
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs["father_pk"]
            stats.update(["new"])
            changes.append(attrs)

    # Now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, "family", name=pk_to_name[m.pk])
        if m.hid:
            stats.update(["matches"])
            if len(m.hid) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, "rename", False):
                attrs["name"] = m.hid[-1]
            attrs["hname"] = m.hid[-1]
        else:
            attrs["active"] = False  # mark the languoid as obsolete
            if getattr(m, "pointer", False):
                print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", \
                    ", ".join(m.pointer).encode("utf8")
                stats.update(["migrations"])
                attrs["replacement"] = branch_to_pk[m.pointer]
            else:
                stats.update(["nomatches"])
        changes.append(attrs)

    args.log.info("%s" % stats)

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # ... and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, "language", status=status)
        else:
            attrs = languoid(id_, "language", status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs["active"]
        if id_ in pk_to_name and name != pk_to_name[id_]:
            if slug(pk_to_name[id_]) == slug(name):
                attrs["name"] = name
        if hnode:
            attrs["father_pk"] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs["hname"] = risolate_names[l]
        if l in rcollapsed_names:
            attrs["hname"] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row["ll_hid"]
        if hid and "NOCODE" in hid and hid not in hh_languages:
            # Languoids with Harald's private code that are no longer in use:
            changes.append(languoid(
                row["l_pk"], "language", status="retired", active=False, father_pk=None))

    jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)
def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description="Change requests submitted to the ISO 639-3 registration authority.")
    iid = max(int(DBSession.execute(
        "select max(cast(id as integer)) from source").fetchone()[0]), 500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])

    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)
        if not ref:
            iid += 1
            pk += 1
            ref = Ref(
                pk=pk,
                id=str(iid),
                name='%s %s' % (author, year),
                bibtex_type=EntryType.misc,
                number=crno,
                description=title,
                year=year,
                year_int=int(year),
                title=title,
                author=author,
                address='Dallas',
                publisher='SIL International',
                url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                doctypes_str=dtid,
                providers_str=pid,
                language_note=', '.join(
                    '%(Language Name)s [%(Affected Identifier)s]' % spec for spec in affected),
                jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(
        Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
                .filter(family.pk == TreeClosureTable.parent_pk)\
                .filter(Languoid.pk == TreeClosureTable.child_pk)\
                .filter(Languoid.hid != None)\
                .filter(Languoid.level == LanguoidLevel.language)\
                .filter(Languoid.status == LanguoidStatus.established)\
                .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [c.name for c in family.identifiers
                               if c.type == IdentifierType.iso.value]:
                    family.codes.append(Identifier(
                        id=str(max_identifier_pk + 1),
                        name=mid,
                        type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print '~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs)
                near += 1
                found = True
                break
        if not found:
            print '---', mid, leafs

    print 'matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs'
    print near
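The macrolanguage pass above attaches an ISO macrolanguage code to a family only when the macrolanguage's member codes equal the family's ISO leaf codes exactly; a strict subset is merely reported as a near match. A standalone sketch of that comparison, using made-up codes and plain sets rather than database queries:

families = {"Quechuan": {"quy", "quz", "qub"}, "Aymaran": {"ayr", "ayc"}}
macrolanguages = {"que": {"quy", "quz", "qub"}, "aym": {"ayr"}}

for mid, members in macrolanguages.items():
    # smaller families first, mirroring the sort by leafset size above
    for name, isoleafs in sorted(families.items(), key=lambda p: len(p[1])):
        if members == isoleafs:
            print mid, 'attached to', name
            break
        elif members.issubset(isoleafs):
            print mid, 'near match with', name, 'distance:', len(members), len(isoleafs)
            break
    else:
        print '---', mid, 'has no matching family'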
def prime(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk and t.child_pk = lma.languoid_pk and lma.macroarea_pk = ma.pk
and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1]
        for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
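The first denormalization step above relies on the tree closure table, which lists every (ancestor, descendant) pair, to aggregate macroareas recorded on leaf languoids onto all of their ancestors in one pass. A pure-Python sketch of that aggregation with invented toy data (the pk values and macroarea names are placeholders):

from collections import defaultdict

closure = [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3)]    # (parent_pk, child_pk) pairs, incl. self-pairs
leaf_macroareas = {2: {"Eurasia"}, 3: {"Papunesia"}}  # macroareas known for leaf languoids only

macroareas = defaultdict(set)
for parent, child in closure:
    # every ancestor inherits the macroareas of each of its descendants
    macroareas[parent] |= leaf_macroareas.get(child, set())

print dict(macroareas)  # node 1 ends up with both "Eurasia" and "Papunesia"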
def main(args):
    data = Data(
        created=utc.localize(datetime(2013, 11, 15)),
        updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()

    # Wipe the tables we are about to repopulate.
    DBSession.execute("delete from Language")
    DBSession.execute("delete from Unit")
    DBSession.execute("delete from featuredomain")
    DBSession.execute("delete from family")
    DBSession.execute("delete from source")
    DBSession.execute("delete from parameter")
    DBSession.execute("delete from feature")
    DBSession.execute("delete from domainelement")
    DBSession.execute("delete from valueset")
    DBSession.execute("delete from value")
    DBSession.execute("delete from lsivalue")
    DBSession.execute("delete from dataset")
    DBSession.execute("delete from contributor")
    DBSession.execute("delete from lsilanguage")
    DBSession.execute("delete from contribution")
    DBSession.execute("delete from designer")
    DBSession.flush()

    dtab = partial(_dtab, args.data_file())

    # Languages
    tabfns = os.listdir(
        '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data')[1:]
    args.log.info("Sheets found: %s" % tabfns)

    ldps = []
    lgs = {}
    nfeatures = Counter()
    nlgs = Counter()

    for fn in tabfns:
        for ld in dtab(fn):
            if ld['language_id'] in ('qgr', '---', ''):
                # exclude languages which do not have an iso-code
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
            if not ld["feature_alphanumid"].startswith("DRS") \
                    and ld["feature_alphanumid"].find(".") == -1:
                ldps.append(dp_dict(ld))
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([ld['feature_alphanumid']])

    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    for lgid, lgname in lgs.items():
        data.add(
            models.lsiLanguage, lgid,
            id=lgid, name=lgname, representation=nfeatures.get(lgid, 0))
    DBSession.flush()

    load_families(
        data,
        [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l)
         for l in data['lsiLanguage'].values() if l.id != '---' and l.id != ''],
        isolates_icon='tcccccc')

    # Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    # Designers
    for i, info in enumerate([
            {'designer': 'shafqat', 'domain': '', 'pdflink': '', 'citation': ''},
            {'designer': '-', 'domain': '', 'pdflink': '', 'citation': ''}]):
        designer_id = str(i + 1)
        data.add(
            models.Designer, info['designer'],
            id=designer_id,
            name=designer_id,
            domain=info["domain"],
            contributor=info['designer'],
            pdflink=info["pdflink"],
            citation=info["citation"])
    DBSession.flush()

    # Features
    fs = [(fid, mergeds(lds))
          for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdesc = [
        (fid, [(ld.get("feature_possible_values"), ld.get("fromfile"))
               for ld in lds if ld.get("feature_possible_values")])
        for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]

    # Feature names must be unique per feature id:
    for _, dfsids in groupby(
            sorted((f.get('feature_name', fid), fid) for fid, f in fs),
            key=lambda t: t[0]):
        assert len(list(dfsids)) == 1

    for fid, f in fs:
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)
        feature = data.add(
            models.Feature, fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get(
                "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        for (v, desc) in vdesc.items():
            data.add(
                common.DomainElement, (fid, v),
                id='%s-%s' % (fid, v),
                name=v,
                description=desc,
                jsondata={"icon": Colors[v]},
                number=vi[v],
                parameter=feature)
    DBSession.flush()

    # Report duplicate values for the same (feature, language) pair:
    for ((f, lg), ixs) in grp2(
            [((ld['feature_alphanumid'], ld['language_id']), i)
             for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn(
            "Dup value %s %s %s"
            % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs]))

    errors = {}
    done = set()
    glottolog = Glottolog()

    for ld in ldps:
        # Determine the Glottolog family, so that different families get
        # different map markers for the features.
        language = data['lsiLanguage'][ld['language_id']]
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family:
                    family = data['Family'].get(gl_family.id)

        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ in done:
            continue

        if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            info = (
                ld['feature_alphanumid'],
                ld.get('feature_name', "[Feature Name Lacking]"),
                ld['language_id'],
                ld['value'],
                ld['fromfile'])
            msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info
            args.log.error(msg.format(sorted([
                y for (x, y) in data['DomainElement'].keys()
                if x == ld['feature_alphanumid']])))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None)
        models.lsiValue(
            id=id_,
            domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])],
            jsondata={
                "icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata,
                "family": FamilyCodes[str(family)]},
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)

    DBSession.flush()

    # To CLDF
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ not in done:
            continue
        dt = (
            lgs[ld['language_id']],
            ld['language_id'],
            ld['feature_alphanumid'] + ". " + ld['feature_name'],
            ld["value"])
        cldf[dt] = None

    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "lsi.cldf")

    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="LSI",
        name='Linguistic Survey of India',
        publisher_name="Sprakbanken",
        publisher_place="Gothenburg",
        publisher_url="to be given",
        description="this is to be followed",
        domain='http://lsi.clld.org',
        published=date(2016, 5, 16),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
                'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'})

    # disabled for experimental purposes, names were appearing multiple times
    for i, contributor in enumerate([
            common.Contributor(
                id="Lars Borin", name="Lars Borin", email="*****@*****.**"),
            common.Contributor(
                id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**"),
            common.Contributor(
                id="Anju Saxena", name="Anju Saxena", email="*****@*****.**"),
            common.Contributor(
                id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**")]):
        common.Editor(dataset=dataset, contributor=contributor, ord=i)

    DBSession.add(dataset)
    DBSession.flush()
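The feature-building step above depends on the datapoints being sorted before grouping: itertools.groupby only groups adjacent items, which is why ldps is sorted by feature id before the per-feature records are merged. A small standalone sketch of that pattern with toy records (the field names mirror the script, the values are invented):

from itertools import groupby

ldps = [
    {'feature_alphanumid': '2', 'value': 'a'},
    {'feature_alphanumid': '1', 'value': 'x'},
    {'feature_alphanumid': '1', 'value': 'y'},
]
# sort first, otherwise groupby would yield the two '1' records as separate groups
ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])
for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid']):
    print fid, [ld['value'] for ld in lds]
# prints: 1 ['x', 'y'] followed by 2 ['a']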