Example #1
0
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

    args.log.info('%s' % stats)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
Example #2
0
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s",
                (_end, pk))

    return changes
Example #3
0
# -*- coding: utf-8 -*-
import transaction
from clld.scripts.util import parsed_args

from glottolog3.scripts.util import update_providers


if __name__ == "__main__":  # pragma: no cover
    with transaction.manager:
        update_providers(parsed_args((("--version",), dict(default=""))))