def main(args):
    """Synchronize the stored Glottolog languoid tree with Harald's classification.

    Reads the new classification from ``lff.txt`` (family trees) and ``lof.txt``
    (unclassifiable/unattested languoids), matches it against the languoid tree
    currently in the database, and writes the resulting inserts/updates — as a
    list of languoid attribute dicts — to ``languoids.json``.

    NOTE(review): a second ``main`` defined later in this module shadows this
    one at import time; this appears to be the legacy implementation kept for
    reference.
    """
    # Unless --all is given, only active languoids are considered when
    # collecting existing glottolog families below.
    active_only = not args.all
    # map Harald's language id (hid) -> languoid pk for all known languages
    codes = dict((row[0], row[1]) for row in
                 DBSession.execute("select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"))

    # highest pk currently in use; incremented whenever a new languoid is made
    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    # cache of glottocodes assigned during this run (shared with glottocode())
    gcs = {}

    # map languoid pk -> current name, used for name comparisons and output
    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()

    parse_families(data_file(args, 'lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
    # (Python 2: .keys() returns a list, so deleting from `families` while
    # iterating its keys is safe, and .keys()[0] indexing works)
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate: clear its branch — it gets no family parent
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                # single-leaf subfamily: re-attach the leaf to the parent branch
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(data_file(args, 'lof.txt'), families, languages)

    # insert H-languages that are not in the DB yet; ncodes maps their hid
    # to the freshly assigned pk
    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            languoids.append(attrs)

    # urnodes: leaf-sets seen more than once (duplicate), mapped to the
    # longer/more specific branch; rnodes: canonical leaf-set -> branch
    urnodes = {}
    rnodes = {}
    for family in families:
        #leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        # only leafs already present in the DB count for matching purposes
        leafs = tuple(sorted(code for code in families[family].keys() if code in codes))
        try:
            assert leafs
        except:
            print 'Family with only new languages!!'
            print family
            continue
            #raise
        if leafs in rnodes:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            try:
                assert [n for n in family if n.startswith('Unclassified')]
            except:
                print family
                print leafs
                # ... or the full leafset contains new languages
                assert [code for code in families[family[:-1]].keys() if code in ncodes]
            # the previously seen branch must be an ancestor of this one
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #

    # for set comparisons we compute a list of actual sets of leafs as well,
    # ordered by size so smaller (more specific) sets come first
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    # list of Migration instances: (pk of existing gl node, matched branch or None)
    todo = []

    # dict mapping (pk, level, name, father_pk) tuples for gl languoids of
    # level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        # NOTE(review): this variant drops the "level is null" disjunction,
        # not just inactive rows — confirm that is intended
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        # all non-provisional coded descendants of this family via the closure table
        leafs = [r[0] for r in DBSession.execute(
            "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
            % row[0])]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names))

    # compile a mapping for exact matches: branch (m.hid) -> existing pk
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # conflict: two existing nodes claim the same branch —
                    # resolve by comparing names with the branch's last segment:
                    if lnames[m.pk] == m.hid[-1]:
                        print '#### type1'
                        branch_to_pk[m.hid] = m.pk
                    elif lnames[branch_to_pk[m.hid]] == m.hid[-1]:
                        print '#### type2'
                        pass
                    else:
                        print m.hid
                        print m.hid[-1]
                        print lnames[m.pk]
                        print branch_to_pk[m.hid]
                        print m.pk
                        raise ValueError
            else:
                #assert m.hid not in branch_to_pk
                branch_to_pk[m.hid] = m.pk

    new = 0
    # sorting by (branch length, branch) guarantees a parent branch is
    # processed — and assigned a pk — before any of its children
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                try:
                    assert [n for n in hnode if n.startswith('Unclassified')]
                except:
                    # or the "new language inserted higher up" case!
                    assert [code for code in families[hnode[:-1]].keys() if code in ncodes]
                    #print hnode
                    #print t
                    #raise
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                # parent branch was already assigned a pk (breadth-first order)
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            # matched an existing family to a branch in the new classification
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1

            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            # no match: retire the node, optionally pointing at a replacement
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(m.pointer).encode('utf8')
                migrations += 1

                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    # reverse lookups: hid -> original name recorded when collapsing above
    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        # pk comes from the DB if known, else from the inserts made above
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
        "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False, father_pk=None)
            languoids.append(attrs)

    with open(data_file(args, 'languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
def main(args):
    """Synchronize the Glottolog languoid tree with Harald's classification files.

    Rewritten variant of the sync routine using SQLAlchemy Core instead of raw
    SQL. Reads the new classification from ``languoids/lff.txt`` (family
    trees) and ``languoids/lof.txt`` (unclassifiable/unattested languoids),
    matches it against the languoid tree currently in the database, and dumps
    the collected inserts/updates to ``languoids/changes.json``.

    NOTE(review): this definition shadows the earlier ``main`` in this module.
    """
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias("l"), Languoid.__table__.alias("ll")
    # one joined row per languoid, labeled columns (l_pk, ll_hid, ...)
    gl_languoids = list(DBSession.execute(select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {row["ll_hid"]: row["l_pk"] for row in gl_languoids if row["ll_hid"]}
    # NOTE(review): max(*[...]) unpacks the list; it raises TypeError if there
    # is exactly one row — plain max([...]) would be safer. Confirm intent.
    max_languoid_pk = max(*[row["l_pk"] for row in gl_languoids])
    new_glottocodes = {}
    pk_to_name = {row["l_pk"]: row["l_name"] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath("languoids", "lff.txt"), hh_families, hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    # (Python 2: .keys() returns a list, so deleting from hh_families while
    # iterating its keys is safe, and .keys()[0] indexing works)
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate: clear its branch — it gets no family parent
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                # single-leaf subfamily: re-attach the leaf to the parent branch
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(args.data_dir.joinpath("languoids", "lof.txt"), hh_families, hh_languages)

    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    # insert H-languages that are not in the DB yet; new_hid_to_pk maps
    # their hid to the freshly assigned pk
    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn("new code {1} for existing name {0}".format(name, code))
            changes.append(
                languoid(
                    max_languoid_pk,
                    "language",
                    hid=code,
                    id=glottocode(unicode(name), DBSession, new_glottocodes),
                    name=name,
                    hname=name,
                    status=status,
                )
            )
            stats.update(["new_languages"])

    # duplicate_leafset_to_branch: leaf-sets seen more than once, mapped to the
    # longer/more specific branch; leafset_to_branch: canonical leaf-set -> branch
    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        # only leafs already present in the DB count for matching purposes
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info("Family with only new languages: %s, %s" % (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith("Unclassified")]:
                # ... or the full leafset contains new languages
                assert [hid for hid in hh_families[family[:-1]].keys() if hid in new_hid_to_pk]
            # the previously seen branch must be an ancestor of this one
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))]

    # list of match records: (pk of existing gl node, matched branch or None)
    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        """Return the hids of all non-provisional coded descendants of languoid *pk*."""
        l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
        return [
            r["l_hid"]
            for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(
                        l.c.pk == tc.c.child_pk,
                        l.c.hid != None,
                        l.c.status != LanguoidStatus.provisional,
                        tc.c.parent_pk == pk,
                    )
                )
            )
        ]

    # collect the leaf-set for every active family node currently in glottolog
    for row in gl_languoids:
        if row["ll_level"] == LanguoidLevel.family and row["l_active"]:
            leafs = get_leafset(select_leafs(row["l_pk"]))
            assert leafs
            glnode = GLNode(
                row["l_pk"], row["l_name"], row["ll_level"].name, row["ll_father_pk"], row["l_jsondata"].get("hname")
            )
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch, leafsets, fname_to_branches)
        )

    # compile a mapping for exact matches: branch (m.hid) -> existing pk
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # conflict: two existing nodes claim the same branch —
                    # resolve by comparing names with the branch's last segment:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info("#### type1")
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info("#### type2")
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    # sorting by (branch length, branch) guarantees a parent branch is
    # processed — and assigned a pk — before any of its children
    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith("Unclassified")]:
                    # or the "new language inserted higher up" case!
                    assert [hid for hid in hh_families[hnode[:-1]].keys() if hid in new_hid_to_pk]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                "family",
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                # parent branch was already assigned a pk (breadth-first order)
                attrs["father_pk"] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs["father_pk"]
            stats.update(["new"])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, "family", name=pk_to_name[m.pk])
        if m.hid:
            # matched an existing family to a branch in the new classification
            stats.update(["matches"])
            if len(m.hid) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, "rename", False):
                attrs["name"] = m.hid[-1]
            attrs["hname"] = m.hid[-1]
        else:
            attrs["active"] = False  # mark the languoid as obsolete.
            if getattr(m, "pointer", False):
                print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", ", ".join(m.pointer).encode("utf8")
                stats.update(["migrations"])
                attrs["replacement"] = branch_to_pk[m.pointer]
            else:
                stats.update(["nomatches"])
        changes.append(attrs)

    args.log.info("%s" % stats)

    # reverse lookups: hid -> original name recorded when collapsing above
    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            # pk was assigned above when the new H-language was inserted
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, "language", status=status)
        else:
            attrs = languoid(id_, "language", status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs["active"]
        if id_ in pk_to_name and name != pk_to_name[id_]:
            # only adopt the new spelling when it slugs to the same name
            if slug(pk_to_name[id_]) == slug(name):
                attrs["name"] = name
        if hnode:
            attrs["father_pk"] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs["hname"] = risolate_names[l]
        if l in rcollapsed_names:
            attrs["hname"] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row["ll_hid"]
        if hid and "NOCODE" in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(languoid(row["l_pk"], "language", status="retired", active=False, father_pk=None))

    jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)