def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()

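# A minimal sketch (not from the repo) of the record shape main() consumes from
# languoids/changes.json, inferred from the attrs.pop() calls above; all of the
# values shown here are invented for illustration:
EXAMPLE_CHANGE = {
    'pk': 12345,               # primary key of an existing Languoid, or of a new one
    'name': 'Example Languoid',
    'hid': 'xyz',              # a three-letter hid triggers creation of an ISO identifier
    'level': 'language',       # converted via LanguoidLevel.from_string()
    'status': 'established',   # converted via LanguoidStatus.from_string()
    'hname': 'Example (historical name)',  # stored via update_jsondata()
    'replacement': 54321,      # pk of the superseding languoid, recorded as Superseded
}
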
def languages(request):
    if request.params.get('search'):
        return quicksearch(request)

    res = dict(
        countries=json.dumps([
            '%s (%s)' % (c.name, c.id) for c in
            DBSession.query(Country).order_by(Country.description)]),
        params={
            'name': '',
            'iso': '',
            'namequerytype': 'part',
            'country': ''},
        message=None)

    for param, default in res['params'].items():
        res['params'][param] = request.params.get(param, default).strip()

    if res['params']['country']:
        country = res['params']['country']
        try:
            alpha2 = country.split('(')[1].split(')')[0] \
                if len(country) > 2 else country.upper()
            raise HTTPFound(location=request.route_url(
                'languages_alt', ext='map.html', _query=dict(country=alpha2)))
        except IndexError:
            pass

    res['params']['multilingual'] = 'multilingual' in request.params

    if request.params.get('alnum'):
        l = Languoid.get(request.params.get('alnum'), default=None)
        if l:
            raise HTTPFound(location=request.resource_url(l))
        res['message'] = 'No matching languoids found'

    if (res['params']['iso'] and len(res['params']['iso']) < 2) or (
            res['params']['name'] and
            len(res['params']['name']) < 2 and
            res['params']['namequerytype'] == 'part'):
        res.update(
            message='Please enter at least two characters to search',
            map=None,
            languoids=[])
        return res

    languoids = list(getLanguoids(**res['params']))
    if not languoids and \
            (res['params']['name'] or res['params']['iso'] or res['params']['country']):
        res['message'] = 'No matching languoids found'

    #if len(languoids) == 1:
    #    raise HTTPFound(request.resource_url(languoids[0]))

    map_, icon_map, family_map = get_selected_languages_map(request, languoids)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None
    res.update(map=map_, languoids=languoids)
    return res

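# Illustration of the country-parameter handling above (values invented): a
# selection like 'Nigeria (NG)' is reduced to the alpha-2 code 'NG', while a
# bare two-letter input such as 'ng' is upper-cased; either way the view then
# redirects to the map via HTTPFound.
assert 'Nigeria (NG)'.split('(')[1].split(')')[0] == 'NG'
assert 'ng'.upper() == 'NG'
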
def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print(count, 'new multitree identifiers')

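# Assumed shape of args.json for this updater, inferred from the two loops
# above; the glottocode, URL and multitree codes are placeholders. Note that
# the multitree loop looks up langs[gid], so gids under 'multitree' are
# expected to also appear under 'wikipedia'.
EXAMPLE_MULTITREE_JSON = {
    'wikipedia': {
        'abcd1234': 'https://en.wikipedia.org/wiki/Some_language',  # last path segment is stored
    },
    'multitree': {
        'abcd1234': ['abc', 'abc-xyz'],  # multitree codes to attach as identifiers
    },
}
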
def language_index_html(request=None, **kw):
    res = dict(
        countries=dumps([
            '%s (%s)' % (c.name, c.id) for c in
            DBSession.query(Country).order_by(Country.description)]),
        params={
            'name': '',
            'iso': '',
            'namequerytype': 'part',
            'country': ''},
        message=None)

    for param, default in res['params'].items():
        res['params'][param] = request.params.get(param, default).strip()

    res['params']['multilingual'] = 'multilingual' in request.params

    if request.params.get('alnum'):
        l = Languoid.get(request.params.get('alnum'), default=None)
        if l:
            raise HTTPFound(location=request.resource_url(l))
        res['message'] = 'No matching languoids found'

    languoids = list(getLanguoids(**res['params']))
    if not languoids and \
            (res['params']['name'] or res['params']['iso'] or res['params']['country']):
        res['message'] = 'No matching languoids found'

    map_ = LanguoidsMap(languoids, request)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None
    res.update(map=map_, languoids=languoids)
    return res

def macroareas(args, languages, stats):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated
    # macroareas for families easier
    lang_map = {}

    for hid, info in get_lginfo(args, lambda x: x.macro_area):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)

        if not languages[hid]:
            continue

        lang_map[languages[hid].pk] = languages[hid]

        a, r = update_relationship(
            languages[hid].macroareas, [ma_map[info.macro_area]])
        if a or r:
            stats.update(['macroarea'])

    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == true()):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk)\
                .filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)

        a, r = update_relationship(family.macroareas, mas)
        if a or r:
            stats.update(['macroarea'])

    args.log.info('macroareas done')

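# Hedged sketch of the contract these functions assume for
# update_relationship(): sync an ORM relationship collection to a target list
# and report whether anything was added or removed. This is an illustrative
# re-implementation, not the library's code; the optional log parameter
# mirrors the log=args.log call sites below.
def update_relationship_sketch(collection, targets, log=None):
    targets = list(targets)
    added = removed = False
    for item in list(collection):
        if item not in targets:
            collection.remove(item)
            removed = True
    for item in targets:
        if item not in collection:
            collection.append(item)
            added = True
    return added, removed
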
def update(args):
    res = defaultdict(lambda: 0)
    for i, spec in enumerate(args.json):
        action = spec.pop('action')
        if action == 'update':
            update_lang(Languoid.get(spec.pop('id')), **spec)
        res[action] += 1

    for k, v in res.items():
        print(k, v)

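# Assumed record shape for the dispatcher above, inferred from the two pop()
# calls; the id and extra fields are invented. Remaining keys are passed
# through to update_lang() as keyword arguments.
EXAMPLE_SPECS = [
    {'action': 'update', 'id': 'abcd1234', 'name': 'New Name'},
]
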
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}

    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')

    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(id=pid, name='Degree of endangerment')

    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)

    valuesets = {vs.id: vs for vs in param.valuesets}

    for item in reader(args.data_file(DATA_FILE), dicts=True):
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id='vitality-%s' % lang.id,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(
                            common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1

    print('assigned', count, 'unesco urls')
    print('missing iso codes:', notfound)

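# VITALITY_VALUES is unpacked above as (name, description) pairs; a plausible
# excerpt following the UNESCO endangerment categories (descriptions are
# paraphrased here, not quoted from the atlas):
EXAMPLE_VITALITY_VALUES = [
    ('Vulnerable', 'Most children speak the language, but its use may be restricted.'),
    ('Definitely endangered', 'Children no longer learn the language at home.'),
    ('Critically endangered', 'The youngest speakers are elderly, partial speakers.'),
]
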
def add_languoid(request):
    json_data = request.json_body
    try:
        data, errors = LanguoidSchema().load(json_data)
    except ValueError:
        request.response.status = 400
        return {'error': 'Not a valid languoid level'}
    if errors:
        request.response.status = 400
        return {'error': errors}

    languoid = Languoid(**data)
    try:
        DBSession.add(languoid)
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}
    request.response.status = 201
    return LanguoidSchema().dump(languoid).data

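# Sketch of the request/response cycle for add_languoid(), with an invented
# payload and route; the accepted fields are whatever LanguoidSchema defines:
#
#   POST /languoids    {"id": "abcd1234", "name": "Example", "level": "language"}
#   -> 201 with the serialized languoid on success
#   -> 400 with {"error": ...} on schema errors or database errors
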
def coordinates(args, languages):
    diff = lambda x, y: abs(x - y) > 0.001

    for hid, lon, lat in dsv.reader(args.data_file("coordinates.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)

        if not languages[hid]:
            continue

        language = languages[hid]
        lat, lon = map(float, [lat, lon])

        if not language.latitude or not language.longitude:
            language.longitude, language.latitude = lon, lat
            args.log.info("++ %s" % language.id)
        elif diff(language.longitude, lon) or diff(language.latitude, lat):
            language.longitude, language.latitude = lon, lat
            args.log.info("~~ %s" % language.id)

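# Assumed layout of coordinates.tab, inferred from the tuple unpacking above:
# one tab-separated row per language, hid first, then longitude and latitude
# as decimal strings (example row invented):
#
#   abc	12.345	-1.234
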
def coordinates(args, languages, stats):
    diff = lambda x, y: abs(x - y) > 0.001

    for hid, info in get_lginfo(args, lambda x: x.longitude and x.latitude):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)

        if not languages[hid]:
            continue

        language = languages[hid]
        lat, lon = map(float, [info.latitude, info.longitude])

        if not language.latitude or not language.longitude:
            language.longitude, language.latitude = lon, lat
            args.log.info('++ %s' % language.id)
            stats.update(['coordinates_new'])
        elif diff(language.longitude, lon) or diff(language.latitude, lat):
            language.longitude, language.latitude = lon, lat
            args.log.info('~~ %s' % language.id)
            stats.update(['coordinates_changed'])

def countries(args, languages, stats):
    """Update relations between languages and the countries they are spoken in."""
    cname_map = {
        'Tanzania': 'Tanzania, United Republic of',
        'Russia': 'Russian Federation',
        'South Korea': 'Korea, Republic of',
        'Iran': 'Iran, Islamic Republic of',
        'Syria': 'Syrian Arab Republic',
        'Laos': "Lao People's Democratic Republic",
        r"C\^ote d'Ivoire": "Côte d'Ivoire",
        'British Virgin Islands': 'Virgin Islands, British',
        'Bolivia': 'Bolivia, Plurinational State of',
        'Venezuela': 'Venezuela, Bolivarian Republic of',
        'Democratic Republic of the Congo': 'Congo, The Democratic Republic of the',
        'Micronesia': 'Micronesia, Federated States of',
    }

    countries = {}
    for row in dsv.reader(
            args.data_dir.joinpath('languoids', 'forkel_countries.tab'), encoding='latin1'):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)

        if not languages[hid]:
            args.log.warn('unknown hid in countries.tab: %s' % hid)
            continue

        l = languages[hid]
        if l.countries:
            # we only add country relations to new languages
            # or languages which have none.
            continue

        for cname in set(cnames):
            if cname not in countries:
                q = cname if '(' not in cname else cname.split('(')[0].strip()
                countries[cname] = Country.get(cname_map.get(q, q), key='name', default=None)

            if not countries[cname]:
                args.log.warn('unknown country name in countries.tab: %s' % cname)
                continue

            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                stats.update(['countries'])

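# Assumed layout of forkel_countries.tab, inferred from the row unpacking
# above: a hid in the first column followed by any number of country names,
# some carrying a disambiguating suffix in parentheses that gets stripped
# before the cname_map lookup (example row invented):
#
#   abc	Nigeria	Cameroon (North)
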
def update(args):
    count = 0
    assert args.json

    iid = int(
        DBSession.execute("select max(cast(id as integer)) from identifier")
        .fetchone()[0]) + 1
    pk = DBSession.execute("select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print(count, 'new multitree identifiers')

def countries(args, languages):
    count = 0
    countries = {}

    for row in dsv.reader(args.data_file("countries.tab"), encoding="latin1"):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)

        if not languages[hid]:
            continue

        l = languages[hid]
        if l.countries:
            continue

        for cname in set(cnames):
            if cname not in countries:
                countries[cname] = Country.get(cname, key="name", default=None)

            if not countries[cname]:
                continue

            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                count += 1

    print("countries:", count, "relations added")

def macroareas(args, languages):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated
    # macroareas for families easier
    lang_map = {}

    for hid, macroarea in dsv.reader(args.data_file("macroareas.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)

        if not languages[hid]:
            continue

        lang_map[languages[hid].pk] = languages[hid]
        update_relationship(
            languages[hid].macroareas, [ma_map[macroarea]], log=args.log)

    for family in (
            DBSession.query(Languoid)
            .filter(Languoid.level == LanguoidLevel.family)
            .filter(Language.active == True)):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk).filter(
                TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)

        update_relationship(family.macroareas, mas, log=args.log)

    print("macroareas done")

def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)

                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name),
                        l,
                        name=l.name,
                        description='Glottolog',
                        type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

                DBSession.flush()

        recreate_treeclosure()

def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description="Change requests submitted to the ISO 639-3 registration authority.")

    iid = max(int(DBSession.execute(
        "select max(cast(id as integer)) from source").fetchone()[0]), 500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])

    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)

        if not ref:
            iid += 1
            pk += 1
            ref = Ref(
                pk=pk,
                id=str(iid),
                name='%s %s' % (author, year),
                bibtex_type=EntryType.misc,
                number=crno,
                description=title,
                year=year,
                year_int=int(year),
                title=title,
                author=author,
                address='Dallas',
                publisher='SIL International',
                url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                doctypes_str=dtid,
                providers_str=pid,
                language_note=', '.join(
                    '%(Language Name)s [%(Affected Identifier)s]' % spec
                    for spec in affected),
                jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(
        Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
                .filter(family.pk == TreeClosureTable.parent_pk)\
                .filter(Languoid.pk == TreeClosureTable.child_pk)\
                .filter(Languoid.hid != None)\
                .filter(Languoid.level == LanguoidLevel.language)\
                .filter(Languoid.status == LanguoidStatus.established)\
                .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [c.name for c in family.identifiers
                               if c.type == IdentifierType.iso.value]:
                    family.codes.append(Identifier(
                        id=str(max_identifier_pk + 1),
                        name=mid,
                        type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print('~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs))
                near += 1
                found = True
                break
        if not found:
            print('---', mid, leafs)

    print('matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs')
    print(near)

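# Assumed shape of args.json for the ISO 639-3 updater above, inferred from
# the loops over 'changerequests' and 'macrolanguages'; the change-request
# number, names and codes are placeholders:
EXAMPLE_ISO_JSON = {
    'changerequests': {
        '2013-001': [
            {'Language Name': 'Example', 'Affected Identifier': 'abc'},
        ],
    },
    'macrolanguages': {
        'zzz': ['abc', 'abd'],  # macrolanguage code -> its individual-language codes
    },
}
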
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}

    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')

    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(id=pid, name='Degree of endangerment')

    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)

    valuesets = {vs.id: vs for vs in param.valuesets}

    for record in et.parse(args.data_file(DATA_FILE)).findall('.//RECORD'):
        item = {}
        for attr in [
                'ID', 'Name in English', 'Name in French', 'Name in Spanish',
                'Countries', 'Country codes alpha 3', 'ISO639-3 codes',
                'Degree of endangerment']:
            item[attr] = record.find(attr.replace(' ', '_')).text

        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id='vitality-%s' % lang.id,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(
                            common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1

    print('assigned', count, 'unesco urls')
    print('missing iso codes:', notfound)

def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(joinedload_all(
            Language.languageidentifier, LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        # families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active)
            and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn('brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)

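# Assumed shape of languoid_refs.json, inferred from the .get() calls above:
# keys are Ref pks as strings, values are lists of Languoid pks (all numbers
# here are invented):
EXAMPLE_NODEREFS = {
    'delete': {'1001': [42]},   # relations to drop for a given ref
    'create': {'1002': [43]},   # relations to add from the filemaker data
}
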
def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description="Change requests submitted to the ISO 639-3 registration authority.")

    iid = max(int(DBSession.execute(
        "select max(cast(id as integer)) from source").fetchone()[0]), 500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])

    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)

        if not ref:
            iid += 1
            pk += 1
            ref = Ref(
                pk=pk,
                id=str(iid),
                name='%s %s' % (author, year),
                bibtex_type=EntryType.misc,
                number=crno,
                description=title,
                year=year,
                year_int=int(year),
                title=title,
                author=author,
                address='Dallas',
                publisher='SIL International',
                url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                language_note=', '.join(
                    '%(Language Name)s [%(Affected Identifier)s]' % spec
                    for spec in affected),
                jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(
        Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
                .filter(family.pk == TreeClosureTable.parent_pk)\
                .filter(Languoid.pk == TreeClosureTable.child_pk)\
                .filter(Languoid.hid != None)\
                .filter(Languoid.level == LanguoidLevel.language)\
                .filter(Languoid.status == LanguoidStatus.established)\
                .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [c.name for c in family.identifiers
                               if c.type == IdentifierType.iso.value]:
                    family.codes.append(Identifier(
                        id=str(max_identifier_pk + 1),
                        name=mid,
                        type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print('~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs))
                near += 1
                found = True
                break
        if not found:
            print('---', mid, leafs)

    print('matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs')
    print(near)

def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(
            joinedload_all(Language.languageidentifier,
                           LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(DBSession.query(Ref).order_by(desc(Source.pk)),
                          n=10000,
                          commit=True,
                          verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith(
                'Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        # families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active)
            and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn(
                        'brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)

def main(args):  # pragma: no cover
    with transaction.manager:
        max_identifier_pk = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file('languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        cv = getattr(l, k)
                        if isinstance(cv, EnumSymbol):
                            cv = cv.value
                        assert v == cv
                        #setattr(l, k, v)

                    if len(l.hid or '') == 3:
                        assert l.iso_code
                        #if not l.iso_code:
                        #    l.identifiers.append(
                        #        Identifier(
                        #            id=str(max_identifier_pk + 1),
                        #            name=l.hid,
                        #            type=IdentifierType.iso.value))
                        #    max_identifier_pk += 1
                else:
                    # verification run: new languoids are not expected, so the
                    # creation path below is guarded off and unreachable.
                    raise ValueError()
                    try:
                        l = Languoid(**attrs)
                    except Exception:
                        print(attrs)
                        raise
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        l.identifiers.append(
                            Identifier(
                                id=str(max_identifier_pk + 1),
                                name=attrs['hid'],
                                type=IdentifierType.iso.value))
                        max_identifier_pk += 1

                    if ma:
                        l.macroareas.append(ma_map[ma])

                    l.identifiers.append(
                        Identifier(
                            id=str(max_identifier_pk + 1),
                            name=l.name,
                            description='Glottolog',
                            type='name'))
                    max_identifier_pk += 1

                if hname:
                    assert l.jsondata['hname'] == hname
                    #l.hname = hname

                if replacement:
                    # verification run: replacements are likewise not expected.
                    raise ValueError()
                    DBSession.add(
                        Superseded(
                            languoid_pk=l.pk,
                            replacement_pk=replacement,
                            relation='classification update'))

def update_reflang(args):
    with open(args.data_file('brugmann_noderefs.json')) as fp:
        brugmann_noderefs = json.load(fp)

    ignored, obsolete, changed, unknown = 0, 0, 0, {}

    languoid_map = {}
    for l in DBSession.query(Languoid).filter(Languoid.hid != None):
        languoid_map[l.hid] = l.pk

    lgcodes = {}
    for rec in Database.from_file(
            args.data_file(args.version, 'refs.bib'), encoding='utf8'):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    #for ref in DBSession.query(Ref).order_by(desc(Source.pk)).limit(10000):
    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            ignored += 1
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            obsolete += 1
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active)
            and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    #print('relation added according to brugmann data')
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    print('brugmann relation for non-existing languoid')

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                unknown[code] = 1
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            changed += 1

    print(ignored, 'ignored')
    print(obsolete, 'obsolete')
    print(changed, 'changed')
    print('unknown codes', unknown.keys())