Example #1
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate(
        [
            ("bickel", "Balthasar Bickel", "University of Zurich"),
            ("nichols", "Johanna Nichols", "University of California, Berkeley"),
        ]
    ):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
        args.data_file("backbone_09Jan2014_directexport.tab"), newline="\r", encoding="macroman", namedtuples=True
    ):
        # LID	language	ISO639.3.2013	stock	continent	area	latitude	longitude
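        # rows(..., namedtuples=True) is assumed to yield one record per data line,
        # with the column names above as attributes (dots sanitized to underscores),
        # e.g. l.LID, l.language, l.ISO639_3_2013, l.stock, l.latitude.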
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
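
The snippet above is the "create" step of a clld initializedb.py script. As a minimal sketch of the surrounding scaffolding (an assumption, not shown in the excerpt), such a main is typically wired up through clld's initializedb helper; a prime_cache hook, if the script defines one, would be passed alongside:

from clld.scripts.util import initializedb

if __name__ == '__main__':
    # parse the app's .ini path from the command line and call main(args)
    initializedb(create=main)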
Example #2
def main(args):
    citations.main(args)
    data = Data()

    pairs = {}
    languages = {}
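    # COORDS, glottocoords, glottonames and rglottonames are assumed to be
    # module-level mappings defined elsewhere in the full script (not part of this
    # excerpt): language name -> (lat, lon), iso -> (lat, lon), iso -> Glottolog
    # name, and a reverse lookup keyed by slug(name), respectively.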

    coords = {}
    for lang in dsv.rows(
        args.data_file('MB_Map_Data_Aug13WLabels'),
        namedtuples=True,
        newline='\n',
        encoding='latin1'
    ):
        coords[slug(lang.Label.split('<')[0].strip())] = (
            float(lang.y), float(lang.x))

    xls = xlrd.open_workbook(args.data_file('MB_BoCatSum_AFBO.xlsx'))
    matrix = xls.sheet_by_name('MB_BoCatSum_AFBO.txt')
    md = "area\trecipient language iso\trecipient language genus\tdonor language iso\tdonor language genus".split('\t')

    fields = []
    params = []
    for i in range(matrix.ncols):
        colname = xlrd.colname(i)
        if len(colname) == 2 and colname > 'BE':
            break
        colval = matrix.cell(0, i).value.strip()
        if (len(colname) == 1 and colname > 'G') or (len(colname) == 2 and colname < 'AY'):
            params.append(colval)
            fields.append(colval)
        else:
            fields.append(colval.lower())

    for f in fields:
        if fields.count(f) > 1:
            print(f)

    assert len(fields) == len(set(fields))

    for j in range(1, matrix.nrows):
        values = dict(zip(fields, [matrix.cell(j, i).value for i in range(matrix.ncols)]))
        try:
            id_ = int(values['perm.id'])
        except (KeyError, ValueError):
            continue

        pairs[id_] = values
        for type_ in ['recipient', 'donor']:
            lname = values[type_ + ' language'].strip()
            languages[lname] = {'macroarea': values['area']}
            for key in ['iso', 'genus']:
                languages[lname][key] = values['%s language %s' % (type_, key)]

    for name in COORDS:
        assert name in languages

    sources = {}
    with open(args.data_file('MB_Case_List_with_links.html')) as fp:
        worddoc = fp.read()
        for m in re.finditer(r'"__(?P<recid>[^_]+)__"', worddoc):
            sources[m.group('recid').decode('utf8')] = 1
        soup = bs(worddoc)

    doc = {}
    cols = []
    table = soup.find('table')
    for tr in table.children:
        if tr.name != 'tr':
            continue
        tds = filter(lambda n: n.name == 'td', tr.children)
        if not cols:
            cols = map(text, tds)
        else:
            values = dict(zip(cols, tds))
            try:
                id_ = int(text(values['perm.id']))
                doc[id_] = values
                if id_ in pairs:
                    assert text(values['Recipient lg.']) == pairs[id_]['recipient language']
                    assert text(values['Don']) == pairs[id_]['donor language']
            except:
                continue

    dataset = common.Dataset(
        id='afbo',
        name="AfBo: A world-wide survey of affix borrowing",
        contact="*****@*****.**",
        domain="afbo.info",
        license='http://creativecommons.org/licenses/by/3.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})

    DBSession.add(dataset)
    for i, spec in enumerate([('seifart', "Frank Seifart")]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    contrib = data.add(common.Contribution, 'afbo', name="AfBo", id="afbo")
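
    # maps (iso code, AfBo language name) -> (corrected iso code, glottocode);
    # exactly one of the two replacement values is given, the other is None
    # (cf. the lookup via iso_map.get() further below).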

    iso_map = {
        ('ron', 'Meglenite Romanian'): ('ruq', None),
        ('fra', 'Norman French'): ('xno', None),
        ('tur', 'Turkic'): (None, 'turk1311'),
        ('xuu', 'Kxoe languages'): (None, 'khoe1241'),
        ('zoc', 'Zoquean languages'): (None, 'zoqu1261'),
        ('tzm', 'Moroccan Berber languages'): (None, 'atla1275'),
        ('cvn', 'Quechua'): ('qvn', None),
        ('rop', 'Gurindji Kriol'): (None, 'guri1249'),
        ('ita', 'Sicilian Italian'): ('scn', None),
        ('srp', 'Croatian'): ('hrv', None),
        ('eme', 'Wayampi-Emerillon-Zo’é'): (None, 'waya1271'),
        ('ale', 'Copper Island Aleut'): ('mud', None),
        ('car', 'intermediate Proto-Carib'): (None, 'cari1283'),
        ('ell', 'Cappadocian Greek'): ('cpg', None),
        ('eng', 'Middle English'): ('enm', None),
        ('als', 'Arvanitic Albanian'): ('aat', None),
        ('nys', 'Northern Nyungic'): (None, 'dese1234'),
        ('ron', 'Istro-Romanian'): ('ruo', None),
        ('chf', 'Cho’ol'): ('ctu', None),
        ('tuo', 'Eastern Tucanoan languages'): (None, 'east2698'),
        ('ceb', 'Visayan'): (None, 'bisa1268'),
        ('por', 'Sri Lanka Portuguese'): (None, 'mala1544'),
        ('brx', 'Tibeto-Burman languages'): (None, 'brah1260'),
    }

    with open('name_conflicts.tab', 'w') as fp:
        fp.write('iso\tafbo\tglottolog\tproposed iso\n')
        for i, name in enumerate(languages.keys()):
            md = languages[name]
            iso = md.pop('iso')
            if iso == 'cvn' and name == 'Quechua':
                iso = 'qvn'
            kw = dict(name=name, id=str(i+1), jsondata=md)
            if name in COORDS:
                kw['latitude'], kw['longitude'] = COORDS[name]
            elif slug(name) in coords:
                kw['latitude'], kw['longitude'] = coords[slug(name)]
            elif glottocoords.get(iso):
                kw['latitude'], kw['longitude'] = glottocoords[iso]

            if glottonames.get(iso) and slug(glottonames.get(iso)) != slug(name):
                fp.write(('%s\t%s\t%s\t%s\n' % (
                    iso, name, glottonames.get(iso), rglottonames.get(slug(name), ''))).encode('utf8'))

            if name == 'Meglenite Romanian':
                kw['name'] = 'Megleno Romanian'
            if 'latitude' not in kw:
                print(name)
            l = data.add(common.Language, name, **kw)

            iso, gc = iso_map.get((iso, name), (iso, None))

            for code, type_ in [
                (iso, common.IdentifierType.iso),
                (gc or glottocodes.get(iso), common.IdentifierType.glottolog)
            ]:
                if code:
                    identifier = data.add(
                        common.Identifier, code, id=code, name=code, type=type_.value)
                    data.add(
                        common.LanguageIdentifier, '%s-%s' % (code, l.id),
                        identifier=identifier, language=l)

    include = sources.keys() + [
        'myersscottoncontact2002', 'myersscottonlanguage2007',
        'meakinsborrowing2011', 'seifartprinciple2012',
    ]
    refdb = bibtex.Database.from_file(args.data_file('FSeifartZoteroLibrary14Nov2013.bib'))
    for rec in refdb:
        if slug(rec.id) in include:
            data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for i, name in enumerate(params):
        data.add(models.AffixFunction, name, id=str(i + 1), name=name)

    for id_, vd in pairs.items():
        assert id_ in doc

        donor = data['Language'][vd['donor language'].strip()]
        recipient = data['Language'][vd['recipient language'].strip()]

        p = data.add(
            models.Pair,
            id_,
            id=str(id_),
            name=vd['pairs'].replace('Meglenite', 'Megleno'),
            area=recipient.jsondata['macroarea'],
            description=unicode(doc[id_]['comment']).replace(
                '<h1', '<p').replace('</h1>', '</p>').replace('Meglenite', 'Megleno'),
            reliability=vd['reliability'],
            int_reliability=['high', 'mid', 'low'].index(vd['reliability']),
            count_interrel=int(vd[u'number of interrelated affixes']),
            count_borrowed=int(vd['number of borrowed affixes']),
            donor=donor,
            recipient=recipient)
        DBSession.flush()

        for i, param in enumerate(params):
            param_id = i + 1
            value = vd[param]
            if value != '':
                vsid = '%s-%s' % (recipient.id, param_id)
                if vsid in data['ValueSet']:
                    vs = data['ValueSet'][vsid]
                else:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id=vsid,
                        parameter=data['AffixFunction'][param],
                        language=recipient,
                        contribution=contrib)
                data.add(
                    models.waabValue,
                    '%s-%s' % (id_, param_id),
                    id='%s-%s' % (id_, param_id),
                    pair=p,
                    name='%s' % int(value),
                    numeric=int(value),
                    description='%s' % p,
                    valueset=vs)
Example #3
def main(args):
    active_only = not args.all
    coords = dict(
        (r[0], r[1:]) for r in dsv.rows(args.data_file('coordinates.tab')))
    codes = dict((row[0], row[1]) for row in DBSession.execute(
        "select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"
    ))

    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}

    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()
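
    # parse_families (defined elsewhere in this module, not part of the excerpt) is
    # assumed to fill these two dicts roughly as follows (hypothetical values):
    #   families[('Some Family', 'Some Branch')] = {'abc': ...}  # branch -> H-languages
    #   languages['abc'] = [branch, status, name, comment]       # cf. the unpacking below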

    parse_families(args.data_file('lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(args.data_file('lof.txt'), families, languages)

    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            if coords.get(code):
                attrs['longitude'], attrs['latitude'] = map(
                    float, coords.get(code))
            languoids.append(attrs)

    urnodes = {}
    rnodes = {}
    for family in families:
        leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(
            sorted(code for code in families[family].keys() if code in codes))
        assert leafs
        if leafs in rnodes:
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs.
            assert [n for n in family if n.startswith('Unclassified')]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #

    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    todo = []

    # dict mapping (pk, level, name, father_pk) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        leafs = [
            r[0] for r in DBSession.execute(
                "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
                % row[0])
        ]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets,
                                names))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            assert m.hid not in branch_to_pk
            branch_to_pk[m.hid] = m.pk

    new = 0
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                assert [n for n in hnode if n.startswith('Unclassified')]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1

            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(
                    m.pointer).encode('utf8')
                migrations += 1

                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(row[0],
                             'language',
                             status='retired',
                             active=False,
                             father_pk=None)
            languoids.append(attrs)

    with open(args.data_file('languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
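
languoid(...) and glottocode(...) above are helpers from the surrounding module that are not part of this excerpt. Judging from how attrs is used (a plain dict with column names such as 'father_pk' or 'longitude' as keys, collected in languoids and written out with json.dump), a minimal sketch of the assumed languoid helper could look like this; it is an illustration under that assumption, not the project's actual implementation:

def languoid(pk, level, **kw):
    # bundle pk, level and any column overrides into a plain dict that can be
    # appended to languoids and serialized to JSON for a later bulk update
    kw.update(pk=pk, level=level)
    return kw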
Example #5
def inclusive_excusive(args, data, bib):
    """
    Incl	Inclusive/exclusive distinction. 1 = present, 0 = absent.
    Belh	Belhare-type inclusive/exclusive distinction. 1 = present, 0 = absent. NA = no information available.
    MinAug	Minimal/augmented system. 1 = present, 0 = absent. 1? = probably present
    """
    value_map = {
        '0': 'absent',
        '1': 'present',
        '1?': 'probably present',
        'NA': 'no information available'}
    name_map = OrderedDict()
    name_map['Incl'] = 'Inclusive/exclusive distinction'
    name_map['Belh'] = 'Belhare-type inclusive/exclusive distinction'
    name_map['MinAug'] = 'Minimal/augmented system'
    varspec = [(name, set()) for name in name_map.values()]
    rev_name_map = dict(zip(name_map.values(), name_map.keys()))
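    # varspec collects, for each variable (keyed by its long name), the set of
    # values attested in the data; it is serialized into the parameter's jsondata
    # at the end of this function.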

    p, contrib = param_and_contrib(
        data, 'inclusive/exclusive distinction', 'inclusive.exclusive', 2)

    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['bickel']))
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['nichols']))

    allv = rows(
        args.data_file('InclExcl_ISO_bib_stripped.txt'), namedtuples=True, encoding='utf8', newline='\r')

    for lid, values in groupby(sorted(allv, key=lambda j: j.LID), lambda i: i.LID):
        vsid = '%s-%s' % (p.id, lid)
        values = list(values)

        if vsid not in data['ValueSet']:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['Languoid'][lid],
                contribution=contrib,
                parameter=p)
        else:
            vs = data['ValueSet'][vsid]

        bibkeys = []
        for v in values:
            bibkeys.extend(filter(None, [k.strip() for k in v.bibkey.split(',')]))

        for key in set(bibkeys):
            if key in data['Source']:
                source = data['Source'][key]
            elif key in bib.keymap:
                source = data.add(common.Source, key, _obj=bibtex2source(bib[key]))
            else:
                print key
                source = None
            if source:
                DBSession.add(common.ValueSetReference(valueset=vs, source=source))

        for i, value in enumerate(values):
            if i > 0:
                print 'multiple values!'
                raise ValueError
            value_data = OrderedDict()
            for var in name_map.keys():
                val = value_map.get(getattr(value, var))
                if not val:
                    print getattr(value, var)
                    raise ValueError
                value_data[var] = val
            v = data.add(
                common.Value, vsid,
                id=vsid,
                name=' / '.join(value_data.values()),
                #jsondata=value,
                valueset=vs)
            DBSession.flush()
            for j, spec in enumerate(varspec):
                attr, domain = spec
                domain.add(value_data[rev_name_map[attr]])
                DBSession.add(common.Value_data(
                    key=attr, value=value_data[rev_name_map[attr]], ord=j, object_pk=v.pk))

    p.jsondata = {'varspec': [(name, list(domain)) for name, domain in varspec]}