def upgrade():
    conn = op.get_bind()
    tmpl = """\
update source set description = {0} where description is null and {0} is not null"""
    for col in "title booktitle".split():
        conn.execute(tmpl.format(col))

    for row in list(conn.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            conn.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            conn.execute(
                "update ref set endpage_int = %s where pk = %s", (_end, pk))

def main(args):  # pragma: no cover
    with transaction.manager:
        # Recompute page info for sources whose pages_int carries a negative
        # sentinel value.
        for source in DBSession.query(Ref).filter(Source.pages_int < 0):
            if source.pages:
                start, end, number = compute_pages(source.pages)
                if start is not None:
                    source.startpage_int = start
                if end is not None:
                    source.endpage_int = end
                if number:
                    print(source.id, ':', source.pages_int, '-->', number)
                    source.pages_int = number

def upgrade():
    conn = op.get_bind()
    tmpl = """\
update source set description = {0} where description is null and {0} is not null"""
    for col in 'title booktitle'.split():
        conn.execute(tmpl.format(col))

    for row in list(conn.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            conn.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            conn.execute(
                "update ref set endpage_int = %s where pk = %s", (_end, pk))

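# `compute_pages` is imported from a bibliography helper module and is not
# defined in this file. Every caller relies on the same contract: it parses a
# BibTeX pages field into a (startpage, endpage, numberofpages) triple, with
# None for components it cannot extract. A minimal, hypothetical sketch of
# that contract (not the project's actual implementation):
import re


def compute_pages_sketch(pages):
    """Parse a pages field like '11-23' into (start, end, number)."""
    match = re.search(r'(?P<start>[0-9]+)\s*-+\s*(?P<end>[0-9]+)', pages)
    if match:
        start, end = int(match.group('start')), int(match.group('end'))
        return start, end, end - start + 1
    match = re.search(r'[0-9]+', pages)
    if match:
        # a bare number is taken to be the page count
        return None, None, int(match.group(0))
    return None, None, None
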
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)
        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print(i, 'records done', count, 'changed')

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print(k, '--', v)
                            print(k, '++', kw[k])
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in set(filter(
                    None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(
                    None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')]
                 for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1

            ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
            ref.providers_str = ', '.join(o.id for o in ref.providers)

        print(count, 'records updated or imported')
        print(skipped, 'records skipped because of lack of information')

        DBSession.execute(
            "update source set description = title where description is null and title is not null;")
        DBSession.execute(
            "update source set description = booktitle where description is null and booktitle is not null;")

        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s",
                    (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s", (_end, pk))

    return changes

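# PREF_YEAR_PATTERN and YEAR_PATTERN are imported regexes, not defined here.
# Plausible stand-ins consistent with the usage above (hypothetical; both
# must expose a named group 'year'):
import re

PREF_YEAR_PATTERN = re.compile(r'\[(?P<year>(1|2)[0-9]{3})\]')  # 4-digit year in brackets
YEAR_PATTERN = re.compile(r'(?P<year>(1|2)[0-9]{3})')  # first 4-digit year anywhere

# E.g. for year == '1996 [2002]', PREF_YEAR_PATTERN yields 2002, while
# YEAR_PATTERN alone would yield 1996.
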
def load_ref(data, entry, lgcodes, lgsources):
    kw = {'jsondata': {}, 'language_note': entry.fields.get('lgcode')}
    for col in common.Source.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in models.Ref.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in entry.fields:
        if col not in kw:
            kw['jsondata'][col] = entry.fields[col]

    try:
        btype = EntryType.from_string(entry.type.lower())
    except ValueError:
        btype = EntryType.misc

    # try to extract numeric year, startpage, endpage, numberofpages, ...
    if kw.get('year'):
        # prefer years in brackets over the first 4-digit number.
        match = PREF_YEAR_PATTERN.search(kw.get('year'))
        if match:
            kw['year_int'] = int(match.group('year'))
        else:
            match = YEAR_PATTERN.search(kw.get('year'))
            if match:
                kw['year_int'] = int(match.group('year'))

    if kw.get('publisher'):
        p = kw.get('publisher')
        if ':' in p:
            address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
            if 'address' not in kw or kw['address'] == address:
                kw['address'], kw['publisher'] = address, publisher

    if kw.get('numberofpages'):
        try:
            kw['pages_int'] = int(kw.get('numberofpages').strip())
        except ValueError:
            pass

    if kw.get('pages'):
        start, end, number = compute_pages(kw['pages'])
        if start is not None:
            kw['startpage_int'] = start
        if end is not None:
            kw['endpage_int'] = end
        if number is not None and 'pages_int' not in kw:
            kw['pages_int'] = number

    kw.update(
        id=entry.fields['glottolog_ref_id'],
        fts=fts.tsvector(
            '\n'.join(v for k, v in entry.fields.items() if k != 'abstract')),
        name='%s %s' % (
            entry.fields.get('author', 'na'), entry.fields.get('year', 'nd')),
        description=entry.fields.get('title') or entry.fields.get('booktitle'),
        bibtex_type=btype)
    ref = models.Ref(**kw)
    DBSession.add(ref)
    DBSession.flush()

    reflangs = []
    # `provs in no_ca` is a list membership test, i.e. it is true iff the
    # entry's providers are exactly {'degruyter'} or exactly {'benjamins'}.
    no_ca = [{'degruyter'}, {'benjamins'}]
    provs = set()
    for key in entry.fields['srctrickle'].split(','):
        key = key.strip()
        if key:
            if key in lgsources:
                reflangs.extend(lgsources[key])
            prov, key = key.split('#', 1)
            provs.add(prov)
            DBSession.add(models.Refprovider(
                provider_pk=data['Provider'][prov].pk,
                ref_pk=ref.pk,
                id='{0}:{1}'.format(prov, key)))

    langs, trigger = entry.languoids(lgcodes)
    if trigger and ((provs in no_ca) or reflangs):
        # Discard computer-assigned languoids for bibs where this does not
        # make sense, or for bib entries that have been manually assigned in
        # a Languoid's ini file.
        langs, trigger = [], None
    for lid in set(reflangs + langs):
        DBSession.add(common.LanguageSource(
            language_pk=data['Languoid'][lid].pk, source_pk=ref.pk))
    if trigger:
        ref.ca_language_trigger = trigger

    doctypes, trigger = entry.doctypes(data['Doctype'])
    if trigger is None or provs not in no_ca:
        for dt in set(doctypes):
            DBSession.add(models.Refdoctype(doctype_pk=dt.pk, ref_pk=ref.pk))
    if trigger:
        ref.ca_doctype_trigger = trigger

    return ref

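# Worked example for the srctrickle parsing in load_ref above, with a
# hypothetical field value; each comma-separated item is '<provider>#<key>':
def parse_srctrickle(srctrickle):
    """Return (providers, [(provider, key), ...]) from a srctrickle field."""
    provs, items = set(), []
    for key in srctrickle.split(','):
        key = key.strip()
        if key:
            prov, key = key.split('#', 1)
            provs.add(prov)
            items.append((prov, key))
    return provs, items

# parse_srctrickle('hh#s:Abbott:Southern, degruyter#123')
# -> ({'hh', 'degruyter'}, [('hh', 's:Abbott:Southern'), ('degruyter', '123')])
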
def prime(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    # Assign macroareas aggregated from descendants to languoids which do
    # not have any macroarea assigned explicitly.
    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk
and t.child_pk = lma.languoid_pk
and lma.macroarea_pk = ma.pk
and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1]
        for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))

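# `recreate_treeclosure` (imported) rebuilds the treeclosuretable queried
# above, i.e. the transitive closure of the languoid tree: one row per
# (ancestor, descendant) pair, including each node paired with itself. A
# hypothetical pure-Python sketch of the idea:
def tree_closure_sketch(parent_of):
    """parent_of maps node pk -> parent pk (None for roots).

    Returns the set of (ancestor_pk, descendant_pk) pairs.
    """
    closure = set()
    for node in parent_of:
        current = node
        while current is not None:
            closure.add((current, node))
            current = parent_of[current]
    return closure
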
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print(i, 'records done', stats['updated'] + stats['new'], 'changed')

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            # normalize: strip whitespace, map empty strings to None.
            for k in kw.keys():
                v = kw[k]
                if isinstance(v, str):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in set(filter(
                    None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')]
                 for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

        args.log.info('%s' % stats)

        DBSession.execute(
            "update source set description = title where description is null and title is not null;")
        DBSession.execute(
            "update source set description = booktitle where description is null and booktitle is not null;")

        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s"
                    % (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))

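# `update_relationship` is a clld scripts helper. The `a, r = ...` calls above
# assume the following contract: sync an ORM relationship collection to a
# target list and report what happened. A hypothetical sketch:
def update_relationship_sketch(collection, new_objects):
    """Make ``collection`` contain exactly ``new_objects``.

    Returns (added, removed) flags so callers can track whether the
    record changed.
    """
    added = removed = False
    for obj in list(collection):
        if obj not in new_objects:
            collection.remove(obj)
            removed = True
    for obj in new_objects:
        if obj not in collection:
            collection.append(obj)
            added = True
    return added, removed
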
def load_ref(data, entry, lgcodes, lgsources):
    kw = {'jsondata': {}, 'language_note': entry.fields.get('lgcode')}
    for col in common.Source.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in models.Ref.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in entry.fields:
        if col not in kw:
            kw['jsondata'][col] = entry.fields[col]

    try:
        btype = EntryType.from_string(entry.type.lower())
    except ValueError:
        btype = EntryType.misc

    # try to extract numeric year, startpage, endpage, numberofpages, ...
    if kw.get('year'):
        # prefer years in brackets over the first 4-digit number.
        match = PREF_YEAR_PATTERN.search(kw.get('year'))
        if match:
            kw['year_int'] = int(match.group('year'))
        else:
            match = YEAR_PATTERN.search(kw.get('year'))
            if match:
                kw['year_int'] = int(match.group('year'))

    if kw.get('publisher'):
        p = kw.get('publisher')
        if ':' in p:
            address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
            if 'address' not in kw or kw['address'] == address:
                kw['address'], kw['publisher'] = address, publisher

    if kw.get('numberofpages'):
        try:
            # only accept plausible page counts (below MAX_PAGE).
            pages = int(kw.get('numberofpages').strip())
            if pages < MAX_PAGE:
                kw['pages_int'] = pages
        except ValueError:
            pass

    if kw.get('pages'):
        start, end, number = compute_pages(kw['pages'])
        if start is not None:
            kw['startpage_int'] = start
        if end is not None:
            kw['endpage_int'] = end
        if number is not None and 'pages_int' not in kw:
            kw['pages_int'] = number

    kw.update(
        id=entry.fields['glottolog_ref_id'],
        fts=fts.tsvector(
            '\n'.join(v for k, v in entry.fields.items() if k != 'abstract')),
        name='%s %s' % (
            entry.fields.get('author', 'na'), entry.fields.get('year', 'nd')),
        description=entry.fields.get('title') or entry.fields.get('booktitle'),
        bibtex_type=btype)
    ref = models.Ref(**kw)
    DBSession.add(ref)
    DBSession.flush()

    reflangs = []
    # `provs in no_ca` is a list membership test, i.e. it is true iff the
    # entry's providers are exactly {'degruyter'} or exactly {'benjamins'}.
    no_ca = [{'degruyter'}, {'benjamins'}]
    provs = set()
    for key in entry.fields['srctrickle'].split(','):
        key = key.strip()
        if key:
            if key in lgsources:
                reflangs.extend(lgsources[key])
            prov, key = key.split('#', 1)
            provs.add(prov)
            DBSession.add(models.Refprovider(
                provider_pk=data['Provider'][prov].pk,
                ref_pk=ref.pk,
                id='{0}:{1}'.format(prov, key)))

    langs, trigger = entry.languoids(lgcodes)
    if trigger and ((provs in no_ca) or reflangs):
        # Discard computer-assigned languoids for bibs where this does not
        # make sense, or for bib entries that have been manually assigned in
        # a Languoid's ini file.
        langs, trigger = [], None
    for lid in set(reflangs + langs):
        DBSession.add(common.LanguageSource(
            language_pk=data['Languoid'][lid].pk, source_pk=ref.pk))
    if trigger:
        ref.ca_language_trigger = trigger

    doctypes, trigger = entry.doctypes(data['Doctype'])
    if trigger is None or provs not in no_ca:
        for dt in set(doctypes):
            DBSession.add(models.Refdoctype(doctype_pk=dt.pk, ref_pk=ref.pk))
    if trigger:
        ref.ca_doctype_trigger = trigger

    return ref

def prime(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    # Assign macroareas aggregated from descendants to languoids which do
    # not have any macroarea assigned explicitly.
    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk
and t.child_pk = lma.languoid_pk
and lma.macroarea_pk = ma.pk
and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1]
        for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
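
# Worked examples for the ``items`` normalizer in prime above (hypothetical
# inputs): '**<key>**:<pages>' markers are truncated to the bare '**<key>**'
# and trailing commas are stripped, so page annotations and ordering do not
# mask otherwise identical reference sets:
#
#     items('**hh:42**:10-12, **hh:43**') == {'**hh:42**', '**hh:43**'}
#     items('**hh:43** **hh:42**') == {'**hh:43**', '**hh:42**'}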