コード例 #1
0
ファイル: import_tree.py プロジェクト: pombredanne/glottolog3
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
コード例 #2
0
ファイル: views.py プロジェクト: cevmartinez/glottolog3
def languages(request):
    if request.params.get('search'):
        return quicksearch(request)

    res = dict(
        countries=json.dumps([
            '%s (%s)' % (c.name, c.id) for c in
            DBSession.query(Country).order_by(Country.description)]),
        params={
            'name': '',
            'iso': '',
            'namequerytype': 'part',
            'country': ''},
        message=None)

    for param, default in res['params'].items():
        res['params'][param] = request.params.get(param, default).strip()

    if res['params']['country']:
        country = res['params']['country']
        try:
            alpha2 = country.split('(')[1].split(')')[0] \
                if len(country) > 2 else country.upper()
            raise HTTPFound(location=request.route_url(
                'languages_alt', ext='map.html', _query=dict(country=alpha2)))
        except IndexError:
            pass

    res['params']['multilingual'] = 'multilingual' in request.params

    if request.params.get('alnum'):
        l = Languoid.get(request.params.get('alnum'), default=None)
        if l:
            raise HTTPFound(location=request.resource_url(l))
        res['message'] = 'No matching languoids found'

    if (res['params']['iso'] and len(res['params']['iso']) < 2) or (
            res['params']['name']
            and len(res['params']['name']) < 2
            and res['params']['namequerytype'] == 'part'):
        res.update(
            message='Please enter at least two characters to search',
            map=None,
            languoids=[])
        return res

    languoids = list(getLanguoids(**res['params']))
    if not languoids and \
            (res['params']['name'] or res['params']['iso'] or res['params']['country']):
        res['message'] = 'No matching languoids found'
    #if len(languoids) == 1:
    #    raise HTTPFound(request.resource_url(languoids[0]))

    map_, icon_map, family_map = get_selected_languages_map(request, languoids)

    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None
    res.update(map=map_, languoids=languoids)
    return res
コード例 #3
0
ファイル: wikipedia.py プロジェクト: kublaj/glottolog3
def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print count, 'new multitree identifiers'
コード例 #4
0
def language_index_html(request=None, **kw):
    res = dict(countries=dumps([
        '%s (%s)' % (c.name, c.id)
        for c in DBSession.query(Country).order_by(Country.description)
    ]),
               params={
                   'name': '',
                   'iso': '',
                   'namequerytype': 'part',
                   'country': ''
               },
               message=None)

    for param, default in res['params'].items():
        res['params'][param] = request.params.get(param, default).strip()

    res['params']['multilingual'] = 'multilingual' in request.params

    if request.params.get('alnum'):
        l = Languoid.get(request.params.get('alnum'), default=None)
        if l:
            raise HTTPFound(location=request.resource_url(l))
        res['message'] = 'No matching languoids found'

    languoids = list(getLanguoids(**res['params']))
    if not languoids and \
            (res['params']['name'] or res['params']['iso'] or res['params']['country']):
        res['message'] = 'No matching languoids found'
    map_ = LanguoidsMap(languoids, request)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None
    res.update(map=map_, languoids=languoids)
    return res
コード例 #5
0
def macroareas(args, languages, stats):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated macroareas for
    # families easier
    lang_map = {}
    for hid, info in get_lginfo(args, lambda x: x.macro_area):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        a, r = update_relationship(languages[hid].macroareas, [ma_map[info.macro_area]])
        if a or r:
            stats.update(['macroarea'])

    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == true()):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk)\
                .filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        a, r = update_relationship(family.macroareas, mas)
        if a or r:
            stats.update(['macroarea'])
    args.log.info('macroareas done')
コード例 #6
0
def update(args):
    res = defaultdict(lambda: 0)
    for i, spec in enumerate(args.json):
        action = spec.pop('action')
        if action == 'update':
            update_lang(Languoid.get(spec.pop('id')), **spec)
        res[action] += 1

    for k, v in res.items():
        print k, v
コード例 #7
0
ファイル: glottologcurator.py プロジェクト: kublaj/glottolog3
def update(args):
    res = defaultdict(lambda: 0)
    for i, spec in enumerate(args.json):
        action = spec.pop('action')
        if action == 'update':
            update_lang(Languoid.get(spec.pop('id')), **spec)
        res[action] += 1

    for k, v in res.items():
        print k, v
コード例 #8
0
ファイル: unesco.py プロジェクト: kublaj/glottolog3
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}
    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')
    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(
            id=pid,
            name='Degree of endangerment')
    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)
    valuesets = {vs.id: vs for vs in param.valuesets}
    for item in reader(args.data_file(DATA_FILE), dicts=True):
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id='vitality-%s' % lang.id,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1
    print 'assigned', count, 'unesco urls'
    print 'missing iso codes:', notfound
コード例 #9
0
ファイル: views.py プロジェクト: uwblueprint/glottolog3
def add_languoid(request):
    json_data = request.json_body

    try:
        data, errors = LanguoidSchema().load(json_data)
    except ValueError:
        request.response.status = 400
        return {'error': 'Not a valid languoid level'}
    if errors:
        request.response.status = 400
        return {'error': errors}

    try:
        DBSession.add(Languoid(**data))
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}

    request.response.status = 201
    return LanguoidSchema().dump(Languoid(**data)).data
コード例 #10
0
def coordinates(args, languages):
    diff = lambda x, y: abs(x - y) > 0.001

    for hid, lon, lat in dsv.reader(args.data_file("coordinates.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        language = languages[hid]
        lat, lon = map(float, [lat, lon])

        if not language.latitude or not language.longitude:
            language.longitude, language.latitude = lon, lat
            args.log.info("++ %s" % language.id)
        elif diff(language.longitude, lon) or diff(language.latitude, lat):
            language.longitude, language.latitude = lon, lat
            args.log.info("~~ %s" % language.id)
コード例 #11
0
def coordinates(args, languages, stats):
    diff = lambda x, y: abs(x - y) > 0.001

    for hid, info in get_lginfo(args, lambda x: x.longitude and x.latitude):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            continue
        language = languages[hid]
        lat, lon = map(float, [info.latitude, info.longitude])

        if not language.latitude or not language.longitude:
            language.longitude, language.latitude = lon, lat
            args.log.info('++ %s' % language.id)
            stats.update(['coordinates_new'])
        elif diff(language.longitude, lon) or diff(language.latitude, lat):
            language.longitude, language.latitude = lon, lat
            args.log.info('~~ %s' % language.id)
            stats.update(['coordinates_changed'])
コード例 #12
0
def countries(args, languages, stats):
    """update relations between languages and countries they are spoken in.
    """
    cname_map = {
        'Tanzania': 'Tanzania, United Republic of',
        'Russia': 'Russian Federation',
        'South Korea': 'Korea, Republic of',
        'Iran': 'Iran, Islamic Republic of',
        'Syria': 'Syrian Arab Republic',
        'Laos': "Lao People's Democratic Republic",
        r"C\^ote d'Ivoire": "Côte d'Ivoire",
        'British Virgin Islands': 'Virgin Islands, British',
        'Bolivia': 'Bolivia, Plurinational State of',
        'Venezuela': 'Venezuela, Bolivarian Republic of',
        'Democratic Republic of the Congo': 'Congo, The Democratic Republic of the',
        'Micronesia': 'Micronesia, Federated States of',
    }
    countries = {}
    for row in dsv.reader(
            args.data_dir.joinpath('languoids', 'forkel_countries.tab'), encoding='latin1'):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            args.log.warn('unknown hid in countries.tab: %s' % hid)
            continue
        l = languages[hid]
        if l.countries:
            # we only add country relations to new languages or languages which have none.
            continue
        for cname in set(cnames):
            if cname not in countries:
                q = cname if '(' not in cname else cname.split('(')[0].strip()
                countries[cname] = Country.get(cname_map.get(q, q), key='name', default=None)
            if not countries[cname]:
                args.log.warn('unknown country name in countries.tab: %s' % cname)
                continue
            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                stats.update(['countries'])
コード例 #13
0
ファイル: wikipedia.py プロジェクト: cevmartinez/glottolog3
def update(args):
    count = 0
    assert args.json

    iid = int(
        DBSession.execute("select max(cast(id as integer)) from identifier").
        fetchone()[0]) + 1
    pk = DBSession.execute("select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(pk=pk,
                                                   id=str(iid),
                                                   name=code,
                                                   type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l,
                                              identifier=identifier))

    print count, 'new multitree identifiers'
コード例 #14
0
def countries(args, languages):
    count = 0
    countries = {}
    for row in dsv.reader(args.data_file("countries.tab"), encoding="latin1"):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        l = languages[hid]
        if l.countries:
            continue
        for cname in set(cnames):
            if cname not in countries:
                countries[cname] = Country.get(cname, key="name", default=None)
            if not countries[cname]:
                continue
            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                count += 1

    print "countries:", count, "relations added"
コード例 #15
0
def macroareas(args, languages):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated macroareas for
    # families easier
    lang_map = {}

    for hid, macroarea in dsv.reader(args.data_file("macroareas.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        update_relationship(languages[hid].macroareas, [ma_map[macroarea]], log=args.log)

    for family in (
        DBSession.query(Languoid).filter(Languoid.level == LanguoidLevel.family).filter(Language.active == True)
    ):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk).filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        update_relationship(family.macroareas, mas, log=args.log)
    print "macroareas done"
コード例 #16
0
ファイル: import_tree.py プロジェクト: kublaj/glottolog3
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)
                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name),
                        l,
                        name=l.name,
                        description='Glottolog',
                        type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

                DBSession.flush()

        recreate_treeclosure()
コード例 #17
0
ファイル: iso.py プロジェクト: kublaj/glottolog3
def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description="Change requests submitted to the ISO 639-3 registration authority.")
    iid = max(int(DBSession.execute(
        "select max(cast(id as integer)) from source").fetchone()[0]), 500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])
    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)

        if not ref:
            iid += 1
            pk += 1
            ref = Ref(
                pk=pk,
                id=str(iid),
                name='%s %s' % (author, year),
                bibtex_type=EntryType.misc,
                number=crno,
                description=title,
                year=year,
                year_int=int(year),
                title=title,
                author=author,
                address='Dallas',
                publisher='SIL International',
                url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                doctypes_str=dtid,
                providers_str=pid,
                language_note=', '.join('%(Language Name)s [%(Affected Identifier)s]' % spec for spec in affected),
                jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(
        Identifier.pk).order_by(desc(Identifier.pk)).first()[0]
    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
            .filter(family.pk == TreeClosureTable.parent_pk)\
            .filter(Languoid.pk == TreeClosureTable.child_pk)\
            .filter(Languoid.hid != None)\
            .filter(Languoid.level == LanguoidLevel.language)\
            .filter(Languoid.status == LanguoidStatus.established)\
            .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [c.name for c in family.identifiers
                               if c.type == IdentifierType.iso.value]:
                    family.codes.append(Identifier(
                        id=str(max_identifier_pk + 1),
                        name=mid,
                        type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print '~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs)
                near += 1
                found = True
                break
        if not found:
            print '---', mid, leafs

    print 'matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs'
    print near
コード例 #18
0
ファイル: unesco.py プロジェクト: cevmartinez/glottolog3
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}
    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description=
            'Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas'
        )
    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(id=pid, name='Degree of endangerment')
    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(id='%s-%s' % (pid, number),
                                                name=name,
                                                description=desc,
                                                number=number,
                                                parameter=param)
    valuesets = {vs.id: vs for vs in param.valuesets}

    for record in et.parse(args.data_file(DATA_FILE)).findall('.//RECORD'):
        item = {}
        for attr in [
                'ID', 'Name in English', 'Name in French', 'Name in Spanish',
                'Countries', 'Country codes alpha 3', 'ISO639-3 codes',
                'Degree of endangerment'
        ]:
            item[attr] = record.find(attr.replace(' ', '_')).text
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item[
                        'url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(id='vitality-%s' % lang.id,
                                             parameter=param,
                                             contribution=contrib,
                                             language=lang)
                        DBSession.add(
                            common.Value(valueset=vs,
                                         name=de.name,
                                         domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1
    print 'assigned', count, 'unesco urls'
    print 'missing iso codes:', notfound
コード例 #19
0
ファイル: util.py プロジェクト: pombredanne/glottolog3
def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(joinedload_all(
        Language.languageidentifier, LanguageIdentifier.identifier
    )):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn('brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)
コード例 #20
0
ファイル: iso.py プロジェクト: cevmartinez/glottolog3
def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description=
            "Change requests submitted to the ISO 639-3 registration authority."
        )
    iid = max(
        int(
            DBSession.execute(
                "select max(cast(id as integer)) from source").fetchone()[0]),
        500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])
    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)

        if not ref:
            iid += 1
            pk += 1
            ref = Ref(pk=pk,
                      id=str(iid),
                      name='%s %s' % (author, year),
                      bibtex_type=EntryType.misc,
                      number=crno,
                      description=title,
                      year=year,
                      year_int=int(year),
                      title=title,
                      author=author,
                      address='Dallas',
                      publisher='SIL International',
                      url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                      language_note=', '.join(
                          '%(Language Name)s [%(Affected Identifier)s]' % spec
                          for spec in affected),
                      jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'],
                                key='hid',
                                default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(Identifier.pk).order_by(
        desc(Identifier.pk)).first()[0]
    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
            .filter(family.pk == TreeClosureTable.parent_pk)\
            .filter(Languoid.pk == TreeClosureTable.child_pk)\
            .filter(Languoid.hid != None)\
            .filter(Languoid.level == LanguoidLevel.language)\
            .filter(Languoid.status == LanguoidStatus.established)\
            .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [
                        c.name for c in family.identifiers
                        if c.type == IdentifierType.iso.value
                ]:
                    family.codes.append(
                        Identifier(id=str(max_identifier_pk + 1),
                                   name=mid,
                                   type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print '~~~', family.name, '-->', mid, 'distance:', len(
                    leafs), len(isoleafs)
                near += 1
                found = True
                break
        if not found:
            print '---', mid, leafs

    print 'matched', matched, 'of', len(
        args.json['macrolanguages']), 'macrolangs'
    print near
コード例 #21
0
ファイル: util.py プロジェクト: cevmartinez/glottolog3
def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(
            joinedload_all(Language.languageidentifier,
                           LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(DBSession.query(Ref).order_by(desc(Source.pk)),
                          n=10000,
                          commit=True,
                          verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith(
                'Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active)
            and l.pk not in remove
        ]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn(
                        'brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)
コード例 #22
0
def main(args):  # pragma: no cover
    with transaction.manager:
        max_identifier_pk = DBSession.query(Identifier.pk).order_by(
            desc(Identifier.pk)).first()[0]
        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file('languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        cv = getattr(l, k)
                        if isinstance(cv, EnumSymbol):
                            cv = cv.value
                        assert v == cv
                        #setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        assert l.iso_code
                        #if not l.iso_code:
                        #    l.identifiers.append(
                        #        Identifier(
                        #            id=str(max_identifier_pk + 1),
                        #            name=l.hid,
                        #            type=IdentifierType.iso.value))
                        #    max_identifier_pk += 1
                else:
                    raise ValueError()
                    try:
                        l = Languoid(**attrs)
                    except Exception:
                        print attrs
                        raise
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        l.identifiers.append(
                            Identifier(id=str(max_identifier_pk + 1),
                                       name=attrs['hid'],
                                       type=IdentifierType.iso.value))
                        max_identifier_pk += 1
                    if ma:
                        l.macroareas.append(ma_map[ma])

                    l.identifiers.append(
                        Identifier(id=str(max_identifier_pk + 1),
                                   name=l.name,
                                   description='Glottolog',
                                   type='name'))
                    max_identifier_pk += 1

                if hname:
                    assert l.jsondata['hname'] == hname
                    #l.hname = hname

                if replacement:
                    raise ValueError()
                    DBSession.add(
                        Superseded(languoid_pk=l.pk,
                                   replacement_pk=replacement,
                                   relation='classification update'))
コード例 #23
0
ファイル: util.py プロジェクト: kublaj/glottolog3
def update_reflang(args):
    with open(args.data_file('brugmann_noderefs.json')) as fp:
        brugmann_noderefs = json.load(fp)

    ignored, obsolete, changed, unknown = 0, 0, 0, {}
    languoid_map = {}
    for l in DBSession.query(Languoid).filter(Languoid.hid != None):
        languoid_map[l.hid] = l.pk

    lgcodes = {}
    for rec in Database.from_file(
            args.data_file(args.version, 'refs.bib'), encoding='utf8'):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    #for ref in DBSession.query(Ref).order_by(desc(Source.pk)).limit(10000):
    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            ignored += 1
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            obsolete += 1
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    #print 'relation added according to brugmann data'
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    print 'brugmann relation for non-existing languoid'

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                unknown[code] = 1
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print ref.name, ref.id, '--', l.name, l.id
                print 'relation removed according to brugmann data'
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            changed += 1

    print ignored, 'ignored'
    print obsolete, 'obsolete'
    print changed, 'changed'
    print 'unknown codes', unknown.keys()