def main(args):
    """Sync the Glottolog DB classification with Harald's lff.txt / lof.txt.

    Reads the current classification from the DB (raw SQL via ``DBSession``),
    parses the new classification from ``lff.txt`` / ``lof.txt``, matches old
    family nodes against new branches, and collects attribute dicts for all
    resulting inserts/updates in ``languoids``, which is finally dumped to
    ``languoids.json``.  The DB itself is only read here, never written.

    NOTE(review): Python 2 only -- print statements, ``unicode()``, and
    list-returning ``dict.keys()`` (``families[key].keys()[0]``; deleting
    from ``families`` while iterating ``families.keys()`` is safe only
    because Py2 ``keys()`` returns a list copy).
    """
    active_only = not args.all
    # hid -> (longitude, latitude) strings from the coordinates data file.
    coords = dict(
        (r[0], r[1:]) for r in dsv.rows(args.data_file('coordinates.tab')))
    # hid -> languoid pk for all languoids already in the DB.
    codes = dict((row[0], row[1]) for row in DBSession.execute(
        "select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"
    ))
    # highest pk currently in use; incremented below for each new record.
    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}  # glottocode bookkeeping, threaded through glottocode() calls
    lnames = {}  # pk -> name for all languages in the DB
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]
    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()
    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()
    parse_families(args.data_file('lff.txt'), families, languages)
    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[
                    0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]
    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]
    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_file('lof.txt'), families, languages)
    ncodes = {}  # hid -> newly assigned pk for H-languages not yet in the DB
    languoids = []  # accumulates all attribute dicts for the JSON output
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            if coords.get(code):
                attrs['longitude'], attrs['latitude'] = map(
                    float, coords.get(code))
            languoids.append(attrs)
    urnodes = {}  # leaf-tuple -> duplicate ("Unclassified ...") branch
    rnodes = {}  # leaf-tuple -> branch in the new classification
    for family in families:
        # NOTE(review): this assignment is dead -- immediately overwritten below.
        leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(
            sorted(code for code in families[family].keys() if code in codes))
        assert leafs
        if leafs in rnodes:
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs.
            assert [n for n in family if n.startswith('Unclassified')]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family
    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #
    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]
    todo = []  # list of Migration objects describing retire/match actions
    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"
    for row in DBSession.execute(sql).fetchall():
        leafs = [
            r[0] for r in DBSession.execute(
                "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
                % row[0])
        ]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))
    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]
    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names))
    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            # NOTE(review): hard assert -- any branch matched twice aborts the run.
            assert m.hid not in branch_to_pk
            branch_to_pk[m.hid] = m.pk
    new = 0  # counter of newly inserted family nodes
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                assert [n for n in hnode if n.startswith('Unclassified')]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]
            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                # father must already have a pk: branches are visited breadth first.
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)
    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
                attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(
                    m.pointer).encode('utf8')
                migrations += 1
                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)
    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'
    # invert the name->code maps so languages can be looked up by code below
    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))
    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)
    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False,
                father_pk=None)
            languoids.append(attrs)
    with open(args.data_file('languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
def main(args):
    """Compute ``changes.json`` migrating the DB classification to lff/lof.

    Loads all languoids via a single SQLAlchemy select, parses the new
    classification from ``lff.txt`` / ``lof.txt``, matches old family nodes
    against new branches, and collects every resulting insert/update dict in
    ``changes``, written out with ``jsondump``.  The DB is only read here.

    NOTE(review): Python 2 only -- print statement, ``unicode()``, and
    list-returning ``dict.keys()`` when collapsing single-leaf families.
    """
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias("l"), Languoid.__table__.alias("ll")
    gl_languoids = list(DBSession.execute(
        select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())
    # we collect a list of changes which we will store in a JSON file.
    changes = []
    hid_to_pk = {row["ll_hid"]: row["l_pk"] for row in gl_languoids if row["ll_hid"]}
    # NOTE(review): the ``*`` unpack is redundant -- max(iterable) would do.
    max_languoid_pk = max(*[row["l_pk"] for row in gl_languoids])
    new_glottocodes = {}  # glottocode bookkeeping for glottocode() calls
    pk_to_name = {row["l_pk"]: row["l_name"] for row in gl_languoids}
    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()
    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()
    parse_families(args.data_dir.joinpath("languoids", "lff.txt"),
                   hh_families, hh_languages)
    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]
    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_dir.joinpath("languoids", "lof.txt"),
                   hh_families, hh_languages)
    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)
    new_hid_to_pk = {}  # hid -> newly assigned pk for H-languages not in the DB
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk
            if name in pk_to_name.values():
                args.log.warn("new code {1} for existing name {0}".format(name, code))
            changes.append(
                languoid(
                    max_languoid_pk,
                    "language",
                    hid=code,
                    id=glottocode(unicode(name), DBSession, new_glottocodes),
                    name=name,
                    hname=name,
                    status=status,
                )
            )
            stats.update(["new_languages"])
    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info("Family with only new languages: %s, %s" % (family, langs))
            continue
        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith("Unclassified")]:
                # ... or the full leafset contains new languages
                assert [hid for hid in hh_families[family[:-1]].keys()
                        if hid in new_hid_to_pk]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family
    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #
    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))]
    todo = []  # migration/match actions produced by match_nodes()
    gl_family_to_leafset = {}

    def select_leafs(pk):
        # Return the hids of all non-provisional leaf languages below ``pk``
        # in the existing tree (via the tree closure table).
        l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
        return [
            r["l_hid"]
            for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(
                        l.c.pk == tc.c.child_pk,
                        l.c.hid != None,
                        l.c.status != LanguoidStatus.provisional,
                        tc.c.parent_pk == pk,
                    )
                )
            )
        ]

    for row in gl_languoids:
        if row["ll_level"] == LanguoidLevel.family and row["l_active"]:
            leafs = get_leafset(select_leafs(row["l_pk"]))
            assert leafs
            glnode = GLNode(
                row["l_pk"], row["l_name"], row["ll_level"].name,
                row["ll_father_pk"], row["l_jsondata"].get("hname")
            )
            gl_family_to_leafset[glnode] = leafs
    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)
    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch,
                        duplicate_leafset_to_branch, leafsets, fname_to_branches)
        )
    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info("#### type1")
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info("#### type2")
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk
    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith("Unclassified")]:
                    assert [hid for hid in hh_families[hnode[:-1]].keys()
                            if hid in new_hid_to_pk]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]
            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                "family",
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                # father must already have a pk: branches are visited breadth first.
                attrs["father_pk"] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs["father_pk"]
            stats.update(["new"])
            changes.append(attrs)
    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, "family", name=pk_to_name[m.pk])
        if m.hid:
            stats.update(["matches"])
            if len(m.hid) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, "rename", False):
                attrs["name"] = m.hid[-1]
                attrs["hname"] = m.hid[-1]
        else:
            attrs["active"] = False  # mark the languoid as obsolete.
            if getattr(m, "pointer", False):
                print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", \
                    ", ".join(m.pointer).encode("utf8")
                stats.update(["migrations"])
                attrs["replacement"] = branch_to_pk[m.pointer]
            else:
                stats.update(["nomatches"])
        changes.append(attrs)
    args.log.info("%s" % stats)
    # invert the name->code maps so languages can be looked up by code below
    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))
    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, "language", status=status)
        else:
            attrs = languoid(id_, "language", status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs["active"]
        if id_ in pk_to_name and name != pk_to_name[id_]:
            # only accept the new spelling if it slugs to the same value
            if slug(pk_to_name[id_]) == slug(name):
                attrs["name"] = name
        if hnode:
            attrs["father_pk"] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs["hname"] = risolate_names[l]
        if l in rcollapsed_names:
            attrs["hname"] = rcollapsed_names[l]
        changes.append(attrs)
    for row in gl_languoids:
        hid = row["ll_hid"]
        if hid and "NOCODE" in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(
                languoid(row["l_pk"], "language", status="retired",
                         active=False, father_pk=None))
    jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)
def main(args):
    """Sync the DB classification with lff.txt / lof.txt (tolerant variant).

    Same overall flow as the coordinates-aware revision: read the existing
    classification from the DB via raw SQL, parse the new one from
    ``lff.txt`` / ``lof.txt``, match old family nodes against new branches,
    and dump all insert/update dicts to ``languoids.json``.  Unlike the
    stricter revision, several hard asserts are wrapped in try/except so the
    run continues (or prints diagnostics) instead of aborting.

    NOTE(review): Python 2 only (print statements, ``unicode()``,
    list-returning ``dict.keys()``); the bare ``except:`` clauses below
    would also swallow KeyboardInterrupt -- intentional-looking, but worth
    confirming.
    """
    active_only = not args.all
    # hid -> languoid pk for all languoids already in the DB.
    codes = dict((row[0], row[1]) for row in DBSession.execute(
        "select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"))
    # highest pk currently in use; incremented below for each new record.
    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}  # glottocode bookkeeping, threaded through glottocode() calls
    lnames = {}  # pk -> name for all languages in the DB
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]
    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()
    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()
    parse_families(data_file(args, 'lff.txt'), families, languages)
    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]
    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]
    # now add the unclassifiabble, unattested, un-whatever
    parse_families(data_file(args, 'lof.txt'), families, languages)
    ncodes = {}  # hid -> newly assigned pk for H-languages not yet in the DB
    languoids = []  # accumulates all attribute dicts for the JSON output
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            languoids.append(attrs)
    urnodes = {}  # leaf-tuple -> duplicate ("Unclassified ...") branch
    rnodes = {}  # leaf-tuple -> branch in the new classification
    for family in families:
        #leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(
            sorted(code for code in families[family].keys() if code in codes))
        try:
            assert leafs
        except:
            # family consists entirely of languages new to the DB; skip it.
            print 'Family with only new languages!!'
            print family
            continue
            #raise
        if leafs in rnodes:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            try:
                assert [n for n in family if n.startswith('Unclassified')]
            except:
                print family
                print leafs
                # ... or the full leafset contains new languages
                assert [code for code in families[family[:-1]].keys()
                        if code in ncodes]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family
    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #
    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]
    todo = []  # list of Migration objects describing retire/match actions
    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"
    for row in DBSession.execute(sql).fetchall():
        leafs = [r[0] for r in DBSession.execute(
            "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'" % row[0])]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))
    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]
    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names))
    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if lnames[m.pk] == m.hid[-1]:
                        print '#### type1'
                        branch_to_pk[m.hid] = m.pk
                    elif lnames[branch_to_pk[m.hid]] == m.hid[-1]:
                        print '#### type2'
                        pass
                    else:
                        print m.hid
                        print m.hid[-1]
                        print lnames[m.pk]
                        print branch_to_pk[m.hid]
                        print m.pk
                        raise ValueError
            else:
                #assert m.hid not in branch_to_pk
                branch_to_pk[m.hid] = m.pk
    new = 0  # counter of newly inserted family nodes
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                try:
                    assert [n for n in hnode if n.startswith('Unclassified')]
                except:
                    # or the "new language inserted higher up" case!
                    assert [code for code in families[hnode[:-1]].keys()
                            if code in ncodes]
                    #print hnode
                    #print t
                    #raise
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]
            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                # father must already have a pk: branches are visited breadth first.
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)
    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
                attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(
                    m.pointer).encode('utf8')
                migrations += 1
                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)
    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'
    # invert the name->code maps so languages can be looked up by code below
    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))
    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)
    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False,
                father_pk=None)
            languoids.append(attrs)
    with open(data_file(args, 'languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
def main(args):
    """Compute ``changes.json`` migrating the DB classification to lff/lof.

    Single-quoted variant of the SQLAlchemy-based revision: loads all
    languoids with one select, parses the new classification from
    ``lff.txt`` / ``lof.txt``, matches old family nodes against new
    branches, and writes all resulting insert/update dicts via ``jsondump``.
    The DB itself is only read, never written.

    NOTE(review): Python 2 only (print statement, ``unicode()``,
    list-returning ``dict.keys()`` in the single-leaf-family collapse).
    """
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias('l'), Languoid.__table__.alias('ll')
    gl_languoids = list(
        DBSession.execute(
            select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())
    # we collect a list of changes which we will store in a JSON file.
    changes = []
    hid_to_pk = {
        row['ll_hid']: row['l_pk'] for row in gl_languoids if row['ll_hid']
    }
    # NOTE(review): the ``*`` unpack is redundant -- max(iterable) would do.
    max_languoid_pk = max(*[row['l_pk'] for row in gl_languoids])
    new_glottocodes = {}  # glottocode bookkeeping for glottocode() calls
    pk_to_name = {row['l_pk']: row['l_name'] for row in gl_languoids}
    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()
    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()
    parse_families(args.data_dir.joinpath('languoids', 'lff.txt'),
                   hh_families, hh_languages)
    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[
                    0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]
    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_dir.joinpath('languoids', 'lof.txt'),
                   hh_families, hh_languages)
    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)
    new_hid_to_pk = {}  # hid -> newly assigned pk for H-languages not in the DB
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk
            if name in pk_to_name.values():
                args.log.warn('new code {1} for existing name {0}'.format(
                    name, code))
            changes.append(
                languoid(max_languoid_pk,
                         'language',
                         hid=code,
                         id=glottocode(unicode(name), DBSession, new_glottocodes),
                         name=name,
                         hname=name,
                         status=status))
            stats.update(['new_languages'])
    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info('Family with only new languages: %s, %s' %
                          (family, langs))
            continue
        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith('Unclassified')]:
                # ... or the full leafset contains new languages
                assert [
                    hid for hid in hh_families[family[:-1]].keys()
                    if hid in new_hid_to_pk
                ]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family
    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #
    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [
        set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))
    ]
    todo = []  # migration/match actions produced by match_nodes()
    gl_family_to_leafset = {}

    def select_leafs(pk):
        # Return the hids of all non-provisional leaf languages below ``pk``
        # in the existing tree (via the tree closure table).
        l, tc = Languoid.__table__.alias(
            'l'), TreeClosureTable.__table__.alias('tc')
        return [
            r['l_hid'] for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(l.c.pk == tc.c.child_pk,
                         l.c.hid != None,
                         l.c.status != LanguoidStatus.provisional,
                         tc.c.parent_pk == pk)))
        ]

    for row in gl_languoids:
        if row['ll_level'] == LanguoidLevel.family and row['l_active']:
            leafs = get_leafset(select_leafs(row['l_pk']))
            assert leafs
            glnode = GLNode(row['l_pk'], row['l_name'], row['ll_level'].name,
                            row['ll_father_pk'],
                            row['l_jsondata'].get('hname'))
            gl_family_to_leafset[glnode] = leafs
    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)
    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch,
                        duplicate_leafset_to_branch, leafsets,
                        fname_to_branches))
    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info('#### type1')
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info('#### type2')
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk
    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith('Unclassified')]:
                    assert [
                        hid for hid in hh_families[hnode[:-1]].keys()
                        if hid in new_hid_to_pk
                    ]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [
                    m.pk for m in todo if m.hid
                ]
            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                # father must already have a pk: branches are visited breadth first.
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            stats.update(['new'])
            changes.append(attrs)
    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, 'family', name=pk_to_name[m.pk])
        if m.hid:
            stats.update(['matches'])
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
                attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False  # mark the languoid as obsolete.
            if getattr(m, 'pointer', False):
                print '~~', m.pk, pk_to_name[m.pk].encode('utf8'), '->', \
                    ', '.join(m.pointer).encode('utf8')
                stats.update(['migrations'])
                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                stats.update(['nomatches'])
        changes.append(attrs)
    args.log.info('%s' % stats)
    # invert the name->code maps so languages can be looked up by code below
    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))
    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, 'language', status=status)
        else:
            attrs = languoid(id_, 'language', status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs['active']
        if id_ in pk_to_name and name != pk_to_name[id_]:
            # only accept the new spelling if it slugs to the same value
            if slug(pk_to_name[id_]) == slug(name):
                attrs['name'] = name
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        changes.append(attrs)
    for row in gl_languoids:
        hid = row['ll_hid']
        if hid and 'NOCODE' in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(
                languoid(row['l_pk'], 'language', status='retired',
                         active=False, father_pk=None))
    jsondump(changes, args.data_dir.joinpath('languoids', 'changes.json'),
             indent=4)