Ejemplo n.º 1
0
def macroareas(args, languages, stats):
    """Synchronise macroareas of languages and aggregate them for families.

    First pass: for every ``(hid, info)`` pair yielded by ``get_lginfo`` the
    languoid's macroarea relationship is set to the single macroarea named by
    ``info.macro_area``. Second pass: every active family gets the macroareas of
    all its descendants that were handled in the first pass.

    :param args: script arguments; passed to ``get_lginfo``, ``args.log`` is used \
for the final log message.
    :param languages: mapping hid -> languoid (or ``None``); used as lookup cache \
and extended in place with every hid that had to be fetched.
    :param stats: object with a ``Counter``-style ``update`` method; \
``'macroarea'`` is counted once per languoid whose relationship changed.
    """
    ma_map = get_map(Macroarea)

    # Languoids seen in the first pass, keyed by primary key; the family pass below
    # uses this to find the macroareas of a family's descendants.
    lang_map = {}
    for hid, info in get_lginfo(args, lambda x: x.macro_area):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        languoid = languages[hid]
        if not languoid:
            # no languoid available for this hid - nothing to update
            continue
        lang_map[languoid.pk] = languoid
        added, removed = update_relationship(
            languoid.macroareas, [ma_map[info.macro_area]])
        if added or removed:
            stats.update(['macroarea'])

    families = DBSession.query(Languoid)\
        .filter(Languoid.level == LanguoidLevel.family)\
        .filter(Language.active == true())
    for family in families:
        descendants = DBSession.query(TreeClosureTable.child_pk)\
            .filter(TreeClosureTable.parent_pk == family.pk)
        # each row is a 1-tuple holding the child's primary key
        mas = [
            ma
            for row in descendants if row[0] in lang_map
            for ma in lang_map[row[0]].macroareas]
        added, removed = update_relationship(family.macroareas, mas)
        if added or removed:
            stats.update(['macroarea'])
    args.log.info('macroareas done')
Ejemplo n.º 2
0
def update_providers(args, verbose=False):
    """Create or update ``Provider`` objects from ``BIBFILES.ini``.

    Each section of the ini file describes one provider; the provider id is the
    slug of the section name without a trailing ``.bib``.

    :param args: script arguments; ``args.data_dir`` locates the ini file and \
``args.log`` receives the progress messages.
    :param verbose: if true, old and new value of every changed attribute are \
logged as well (unless they differ in whitespace only).
    """
    config = RawConfigParser()
    ini_path = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    # 'utf-8-sig' transparently drops a leading byte order mark
    with io.open(ini_path, encoding='utf-8-sig') as ini:
        config.readfp(ini)

    known = get_map(Provider)
    for section in config.sections():
        basename = section
        if section.endswith('.bib'):
            basename = section[:-4]
        id_ = slug(basename)
        attrs = {
            'name': config.get(section, 'title'),
            'description': config.get(section, 'description'),
            'abbr': config.get(section, 'abbr'),
        }

        if id_ not in known:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
            continue

        provider = known[id_]
        # Reduce attrs in place to the attributes that actually changed, mapping
        # each of them to an (old, new) pair.
        for key in list(attrs):
            old, new = getattr(provider, key), attrs[key]
            if old == new:
                del attrs[key]
                continue
            setattr(provider, key, new)
            attrs[key] = (old, new)

        if attrs:
            args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
        if not verbose:
            continue
        for key, (old, new) in attrs.items():
            # collapse runs of whitespace so purely cosmetic changes are not reported
            old, new = [' '.join(text.split()) for text in (old, new)]
            if old != new:
                args.log.info('%s\n%r\n%r' % (key, old, new))
Ejemplo n.º 3
0
def update_providers(args):
    """Add providers described in ``provider.txt`` that are missing in the database.

    The file consists of blocks separated by three empty lines. The first line of
    a block has the form ``<id>:<abbr>`` (anything after the first ``.`` of the id
    is dropped), the remaining lines form the description; the provider name is the
    description up to its first ``.``. Existing providers are left untouched.
    Nothing happens if the file does not exist.

    :param args: script arguments; ``args.data_file`` and ``args.version`` locate \
the file, ``args.log`` receives a message per added provider.
    """
    provider_file = args.data_file(args.version, 'provider.txt')
    if not provider_file.exists():
        return

    with open(provider_file) as stream:
        raw = stream.read()
    # the file is latin1 encoded; normalise Windows line endings
    content = raw.decode('latin1').replace('\r\n', '\n')

    known = get_map(Provider)
    for record in content.split('\n\n\n\n'):
        header, _, body = record.partition('\n')
        id_, abbr = header.strip().split(':')
        id_ = id_.partition('.')[0]
        description = unescape(body)
        name = description.split('.')[0]

        # this provider is stored under a different id
        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'

        provider_id = slug(id_)
        if provider_id in known:
            continue
        args.log.info('adding provider %s' % provider_id)
        DBSession.add(Provider(
            id=provider_id, name=name, description=description, abbr=abbr))
Ejemplo n.º 4
0
def macroareas(args, languages):
    """Synchronise macroareas of languages and aggregate them for families.

    Reads ``(hid, macroarea)`` rows from ``macroareas.tab``, assigns the macroarea
    to the matching languoid and afterwards gives every active family the
    macroareas of all its descendants that appeared in the file.

    :param args: script arguments; ``args.data_file`` locates the input file and \
``args.log`` is handed to ``update_relationship``.
    :param languages: mapping hid -> languoid (or ``None``); used as lookup cache \
and extended in place with every hid that had to be fetched.
    """
    ma_map = get_map(Macroarea)

    # Languoids listed in the file, keyed by primary key; needed below to look up
    # the macroareas of a family's descendants.
    lang_map = {}

    for hid, macroarea in dsv.reader(args.data_file("macroareas.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        languoid = languages[hid]
        if not languoid:
            # no languoid available for this hid - skip the row
            continue
        lang_map[languoid.pk] = languoid
        update_relationship(languoid.macroareas, [ma_map[macroarea]], log=args.log)

    families = DBSession.query(Languoid)
    families = families.filter(Languoid.level == LanguoidLevel.family)
    # "== True" builds an SQL expression here; it must not be replaced by "is True"
    families = families.filter(Language.active == True)
    for family in families:
        descendants = DBSession.query(TreeClosureTable.child_pk)
        descendants = descendants.filter(TreeClosureTable.parent_pk == family.pk)
        # each row is a 1-tuple holding the child's primary key
        mas = [
            ma
            for row in descendants if row[0] in lang_map
            for ma in lang_map[row[0]].macroareas]
        update_relationship(family.macroareas, mas, log=args.log)
    print("macroareas done")
Ejemplo n.º 5
0
def update_providers(args, verbose=False):
    """Bring the ``Provider`` table in line with ``BIBFILES.ini``.

    Every ini section stands for one provider whose id is the slug of the section
    name (a trailing ``.bib`` is cut off). Unknown providers are added, known ones
    get their ``name``, ``description`` and ``abbr`` updated where they differ.

    :param args: script arguments; ``args.data_dir`` locates the ini file and \
``args.log`` receives the progress messages.
    :param verbose: if true, additionally log old and new value of each changed \
attribute, ignoring changes in whitespace only.
    """
    ini_path = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    parser = RawConfigParser()
    # 'utf-8-sig' transparently drops a leading byte order mark
    with io.open(ini_path, encoding='utf-8-sig') as handle:
        parser.readfp(handle)

    existing = get_map(Provider)
    for section in parser.sections():
        if section.endswith('.bib'):
            id_ = slug(section[:-4])
        else:
            id_ = slug(section)
        attrs = {
            'name': parser.get(section, 'title'),
            'description': parser.get(section, 'description'),
            'abbr': parser.get(section, 'abbr'),
        }

        if id_ not in existing:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
            continue

        provider = existing[id_]
        # Shrink attrs in place to what really changed; the surviving entries map
        # the attribute name to a (previous, current) pair.
        for attr_name in list(attrs):
            previous = getattr(provider, attr_name)
            current = attrs[attr_name]
            if previous == current:
                del attrs[attr_name]
            else:
                setattr(provider, attr_name, current)
                attrs[attr_name] = (previous, current)

        if attrs:
            args.log.info('updating provider %s %s' %
                          (slug(id_), sorted(attrs)))
        if not verbose:
            continue
        for attr_name, (previous, current) in attrs.items():
            # normalise whitespace so that re-wrapped text is not reported
            previous = ' '.join(previous.split())
            current = ' '.join(current.split())
            if previous != current:
                args.log.info('%s\n%r\n%r' % (attr_name, previous, current))
Ejemplo n.º 6
0
def main(args):  # pragma: no cover
    """Check that the languoids in the database agree with ``languoids.json``.

    This is the verification-only variant of the import: nothing is written. For
    every record of the JSON file the matching languoid (looked up by ``pk``) must
    exist and all its attributes must equal the values from the file.

    The statements that used to follow the two unconditional ``raise ValueError()``
    lines could never run and have been removed, together with the values that were
    only computed for them (the maximal identifier pk and the macroarea map).

    :param args: script arguments; ``args.data_file`` locates ``languoids.json``.
    :raises ValueError: if the file lists a languoid that is not in the database, \
or if a record carries a ``replacement``.
    :raises AssertionError: if an attribute, the iso code or the ``hname`` of a \
languoid does not match the file. Raised explicitly (not via ``assert``) so the \
checks stay active under ``python -O``.
    :raises KeyError: if ``hname`` is given in the file but missing in the \
languoid's ``jsondata``.
    """
    with transaction.manager:
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file('languoids.json')) as fp:
            for attrs in json.load(fp):
                # These keys are handled separately and must not take part in the
                # attribute-by-attribute comparison below.
                attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                languoid = languoids.get(attrs['pk'])
                if not languoid:
                    raise ValueError(
                        'languoid with pk %r is not in the database' % (attrs['pk'],))

                for k, v in attrs.items():
                    if k == 'globalclassificationcomment':
                        continue
                    cv = getattr(languoid, k)
                    if isinstance(cv, EnumSymbol):
                        # the file stores the plain value of enum columns
                        cv = cv.value
                    if not (v == cv):
                        raise AssertionError(
                            'languoid %r: %s is %r in the file but %r in the database'
                            % (attrs['pk'], k, v, cv))

                # a three-letter hid requires an iso code on the languoid
                if len(languoid.hid or '') == 3 and not languoid.iso_code:
                    raise AssertionError(
                        'languoid %r: hid %r but no iso code'
                        % (attrs['pk'], languoid.hid))

                if hname and not (languoid.jsondata['hname'] == hname):
                    raise AssertionError(
                        'languoid %r: hname is %r in the file but %r in the database'
                        % (attrs['pk'], hname, languoid.jsondata['hname']))

                if replacement:
                    raise ValueError(
                        'languoid %r: unexpected replacement %r'
                        % (attrs['pk'], replacement))
Ejemplo n.º 7
0
def main(args):  # pragma: no cover
    """Create or update languoids in the database from ``languoids.json``.

    Known languoids (matched by ``pk``) get all attributes from the file assigned,
    except ``globalclassificationcomment``. Unknown ones are created and receive
    their macroarea and a Glottolog name identifier. In both cases a three-letter
    hid leads to an iso identifier (for known languoids only if none exists yet).
    Afterwards the tree closure table is rebuilt.

    :param args: script arguments; ``args.data_file`` and ``args.version`` locate \
``languoids.json``.
    :raises TypeError: if the identifier table is empty (``first()`` yields ``None``).
    """
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        # NOTE(review): presumably read by create_identifier to allocate new
        # primary keys - confirm against its definition.
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        # existing Glottolog name identifiers, keyed by name
        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                # not plain Languoid attributes - handled separately below
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                # the file stores enum columns as strings
                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)
                    if len(l.hid or '') == 3 and not l.iso_code:
                        create_identifier(
                            None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    # "or ''" guards against an explicit null hid in the file, in
                    # the same way the update branch above guards l.hid.
                    if len(attrs.get('hid') or '') == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)
                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name),
                        l,
                        name=l.name,
                        description='Glottolog',
                        type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

                DBSession.flush()

        recreate_treeclosure()
Ejemplo n.º 8
0
def main(args):  # pragma: no cover
    """Import new and update existing references from ``refs.bib``.

    Records with fewer than six fields are skipped. For the others, column values
    are derived from the BibTeX fields (via ``FIELD_MAP`` / ``CONVERTER``) plus
    numeric year and page information. Unless ``args.mode`` is ``'update'``, records
    whose id is already known are left alone. Macroareas, providers and doctypes of
    each reference are synchronised, and finally some SQL fix-ups for descriptions
    and negative page counts are executed.

    :param args: script arguments; ``data_file``, ``version`` and ``mode`` are read \
here, and the object is handed on to ``update_providers``.
    :return: ``dict`` mapping ref id to ``{column: (old, new)}`` (both as strings) \
for every column changed on an existing reference.
    :raises AssertionError: if a record has no ``glottolog_ref_id``.
    """
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    def append(attr, obj):
        # Add obj to the collection unless it is falsy or already contained;
        # returns True only if something was added (None otherwise).
        if obj and obj not in attr:
            attr.append(obj)
            return True

    def csv_values(text):
        # distinct, stripped, non-empty items of a comma separated string
        return set(filter(None, [s.strip() for s in text.split(',')]))

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print('%s records done %s changed' % (i, count))

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = bool(ref)

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    # field is to be ignored altogether
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        # no dedicated column: keep the value in jsondata
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    # "address: publisher" - split unless it contradicts an
                    # explicitly given address
                    address, publisher = [s.strip() for s in p.split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    # not a plain number - fall back to what "pages" yields below
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            # merge into the stored data, then drop ignored fields;
                            # jsondata changes do not count as "changed"
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print('%s -- %s' % (k, v))
                            print('%s ++ %s' % (k, kw[k]))
                            setattr(ref, k, kw[k])
                            changed = True
                            changes.setdefault(ref.id, {})[k] = ('%s' % v, '%s' % kw[k])
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name]
                 for name in csv_values(kw['jsondata'].get('macro_area', ''))])
            changed = changed or a or r

            # providers are only ever added here, never removed
            for name in csv_values(kw['jsondata'].get('src', '')):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

        print('%s records updated or imported' % count)
        print('%s records skipped because of lack of information' % skipped)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    # Recompute page information for rows with a negative page count. Values are
    # passed as named bind parameters, so None is sent as NULL.
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number is not None and _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = :pages_int, startpage_int = :startpage_int "
                "where pk = :pk",
                {'pages_int': _number, 'startpage_int': _start, 'pk': pk})
            DBSession.execute(
                "update ref set endpage_int = :endpage_int where pk = :pk",
                {'endpage_int': _end, 'pk': pk})

    return changes
Ejemplo n.º 9
0
def main(args):  # pragma: no cover
    """Check that the languoids in the database agree with ``languoids.json``.

    This is the verification-only variant of the import: nothing is written. For
    every record of the JSON file the matching languoid (looked up by ``pk``) must
    exist and all its attributes must equal the values from the file.

    The statements that used to follow the two unconditional ``raise ValueError()``
    lines could never run and have been removed, together with the values that were
    only computed for them (the maximal identifier pk and the macroarea map).

    :param args: script arguments; ``args.data_file`` locates ``languoids.json``.
    :raises ValueError: if the file lists a languoid that is not in the database, \
or if a record carries a ``replacement``.
    :raises AssertionError: if an attribute, the iso code or the ``hname`` of a \
languoid does not match the file. Raised explicitly (not via ``assert``) so the \
checks stay active under ``python -O``.
    :raises KeyError: if ``hname`` is given in the file but missing in the \
languoid's ``jsondata``.
    """
    with transaction.manager:
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file('languoids.json')) as fp:
            for attrs in json.load(fp):
                # These keys are handled separately and must not take part in the
                # attribute-by-attribute comparison below.
                attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                languoid = languoids.get(attrs['pk'])
                if not languoid:
                    raise ValueError(
                        'languoid with pk %r is not in the database' % (attrs['pk'],))

                for k, v in attrs.items():
                    if k == 'globalclassificationcomment':
                        continue
                    cv = getattr(languoid, k)
                    if isinstance(cv, EnumSymbol):
                        # the file stores the plain value of enum columns
                        cv = cv.value
                    if not (v == cv):
                        raise AssertionError(
                            'languoid %r: %s is %r in the file but %r in the database'
                            % (attrs['pk'], k, v, cv))

                # a three-letter hid requires an iso code on the languoid
                if len(languoid.hid or '') == 3 and not languoid.iso_code:
                    raise AssertionError(
                        'languoid %r: hid %r but no iso code'
                        % (attrs['pk'], languoid.hid))

                if hname and not (languoid.jsondata['hname'] == hname):
                    raise AssertionError(
                        'languoid %r: hname is %r in the file but %r in the database'
                        % (attrs['pk'], hname, languoid.jsondata['hname']))

                if replacement:
                    raise ValueError(
                        'languoid %r: unexpected replacement %r'
                        % (attrs['pk'], replacement))
Ejemplo n.º 10
0
def main(bib, mode):  # pragma: no cover
    """Import new and update existing references from BibTeX records.

    Records with fewer than six fields are skipped. Unless ``mode`` is
    ``'update'``, records whose id is already in the database are left alone. For
    the others, column values are derived from the BibTeX fields (via ``FIELD_MAP``
    / ``CONVERTER``) plus numeric year and page information, and macroareas,
    providers, doctypes and languages are attached to the reference.

    :param bib: iterable of BibTeX records (objects offering ``keys``, ``get``, \
``genre`` and ``id``).
    :param mode: ``'update'`` to also update references already in the database.
    :raises AssertionError: if a record has no ``glottolog_ref_id``.
    """
    count = 0
    skipped = 0

    def append(attr, obj):
        # Add obj to the collection unless it is falsy or already contained and
        # report whether something was added. The flag is returned because a
        # nested function cannot rebind the caller's "changed" variable (the
        # original assignment only created a local that was thrown away).
        if obj and obj not in attr:
            attr.append(obj)
            return True
        return False

    def csv_values(text):
        # distinct, stripped, non-empty items of a comma separated string
        return set(filter(None, [s.strip() for s in text.split(',')]))

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        # languoids can be referenced by hid as well as by id
        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                # not enough information
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = bool(ref)

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {
                    'bibtexkey': rec.id
                },
            }

            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        # no dedicated column: keep the value in jsondata
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    # not a plain number - fall back to what "pages" yields below
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    # "address: publisher" - split unless it contradicts an
                    # explicitly given address
                    address, publisher = [s.strip() for s in p.split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                # roman + arabic page counts, in either order
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    start = None
                    number = None
                    match = None

                    # sum up all page ranges; after the loop "match" is the last
                    # range found (or still None)
                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)

                    if match:
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        # TODO: marker carried over from the original code; its
                        # intent is not documented.
                        setattr(ref, k, kw[k])
                        changed = True
                    if ref.title:
                        ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            for name in csv_values(kw['jsondata'].get('macro_area', '')):
                changed = append(ref.macroareas, macroarea_map[name]) or changed

            for name in csv_values(kw['jsondata'].get('src', '')):
                changed = append(ref.providers, provider_map[slug(name)]) or changed

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')):
                changed = append(ref.doctypes, doctype_map[m.group('name')]) or changed

            # a bare three-letter code is put in brackets so CODE_PATTERN finds it
            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        # these two codes are known to be unresolvable
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print('--> unknown code: %s' % code.encode('utf8'))
                    else:
                        changed = append(ref.languages, languoid_map[code]) or changed

            for glottocode in filter(
                    None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print('--> unknown glottocode: %s' % glottocode.encode('utf8'))
                else:
                    changed = append(ref.languages, languoid_map[glottocode]) or changed

            if not update:
                # TODO: marker carried over from the original code; its intent is
                # not documented.
                DBSession.add(ref)

            if i % 100 == 0:
                print('%s records done' % i)

            if changed:
                count += 1

        print('%s records updated or imported' % count)
        print('%s records skipped because of lack of information' % skipped)
Ejemplo n.º 11
0
def main(args):  # pragma: no cover
    """Import new and update existing references from the bibliography.

    Records with fewer than six fields are skipped. For the others, column values
    are derived from the BibTeX fields (via ``FIELD_MAP`` / ``CONVERTER``) plus
    numeric year and page information; string values are stripped and empty ones
    become ``None``. Existing references are updated, unknown ones created, and
    their macroareas, providers and doctypes are synchronised. Finally some SQL
    fix-ups for descriptions and negative page counts are executed and the
    collected column changes are dumped to ``references/changes.json``.

    :param args: script arguments; handed to ``update_providers`` and ``get_bib``, \
``args.log`` receives the statistics and ``args.data_dir`` locates the output file.
    :raises AssertionError: if a record has no ``glottolog_ref_id``.
    """
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    def csv_values(text):
        # distinct, stripped, non-empty items of a comma separated string
        return set(filter(None, [s.strip() for s in text.split(',')]))

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print('%s records done %s changed' % (
                    i, stats['updated'] + stats['new']))

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = bool(ref)

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    # field is to be ignored altogether
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        # no dedicated column: keep the value in jsondata
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    # "address: publisher" - split unless it contradicts an
                    # explicitly given address
                    address, publisher = [s.strip() for s in p.split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    # not a plain number - fall back to what "pages" yields below
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            # strip string values; strings that end up empty are stored as None
            for k, v in list(kw.items()):
                if isinstance(v, basestring):
                    kw[k] = v.strip() or None

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            # keep only the stored keys that do not stem from the
                            # bibliography, then add the fresh data; jsondata
                            # changes do not count as "changed"
                            d = {jk: jv for jk, jv in (ref.jsondata or {}).items()
                                 if jk in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            setattr(ref, k, kw[k])
                            changed = True
                            changes.setdefault(ref.id, {})[k] = ('%s' % v, '%s' % kw[k])
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name]
                 for name in csv_values(kw['jsondata'].get('macro_area', ''))])
            changed = changed or a or r

            # providers are replaced wholesale when the set differs
            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

    args.log.info('%s' % stats)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    # Recompute page information for rows with a negative page count. Values are
    # passed as named bind parameters instead of being interpolated into the SQL
    # string, so a missing start/end page is sent as NULL rather than as the
    # invalid literal "None".
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number is not None and _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = :pages_int, startpage_int = :startpage_int "
                "where pk = :pk",
                {'pages_int': _number, 'startpage_int': _start, 'pk': pk})
            DBSession.execute(
                "update ref set endpage_int = :endpage_int where pk = :pk",
                {'endpage_int': _end, 'pk': pk})

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
Ejemplo n.º 12
0
def main(bib, mode):  # pragma: no cover
    """Import or update bibliographic records from `bib` in the database.

    Every record is turned into keyword arguments for a ``Ref`` (driven by ``FIELD_MAP``
    and ``CONVERTER``), enriched with numeric year and page information, and then either
    applied to the existing ``Source`` row with the same primary key or used to create a
    new ``Ref``.  Afterwards the reference is linked to the macroareas, providers,
    doctypes and languoids named in its ``jsondata``.

    :param bib: iterable of records; each must offer ``keys()``, ``get()``, ``id`` and \
    ``genre``.
    :param mode: unless this is ``'update'``, records whose id is already present in the \
    database are skipped.
    """
    count = 0
    skipped = 0

    def append(attr, obj):
        """Append `obj` to collection `attr` if it is truthy and not yet contained.

        :return: ``True`` if `obj` was appended, ``False`` otherwise.
        """
        # Returning the outcome (instead of assigning an outer flag) is required here:
        # an assignment inside this nested function would only create a local variable.
        if obj and obj not in attr:
            attr.append(obj)
            return True
        return False

    def split_names(value):
        """Return the set of non-empty, stripped, comma-separated items of `value`."""
        return set(filter(None, [s.strip() for s in value.split(',')]))

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        # Languoids can be looked up by either their hid (if any) or their id.
        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            # Records with fewer than six fields carry too little information.
            if len(rec.keys()) < 6:
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = bool(ref)

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            # Fields with a target become model attributes, all others go into jsondata.
            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            # A publisher of the form "address: publisher" is split, unless this would
            # contradict an explicitly given address.
            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in p.split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    # Roman plus arabic page count; an explicit numberofpages wins.
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    start = None
                    number = None
                    match = None

                    # Sum up all page ranges; `match` ends up as the last range found.
                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)

                    if match:
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw:
                    if k == 'pk':
                        continue
                    if kw[k] != getattr(ref, k):
                        setattr(ref, k, kw[k])
                        changed = True
                    # Kept inside the loop on purpose: the description is re-synced with
                    # the title after every attribute, as before.
                    if ref.title:
                        ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            jsondata = kw['jsondata']

            # NOTE: `append(...)` must come first in the `or` expressions below, otherwise
            # short-circuiting would skip the append once `changed` is already True.
            for name in split_names(jsondata.get('macro_area', '')):
                changed = append(ref.macroareas, macroarea_map[name]) or changed

            for name in split_names(jsondata.get('src', '')):
                changed = append(ref.providers, provider_map[slug(name)]) or changed

            for m in DOCTYPE_PATTERN.finditer(jsondata.get('hhtype', '')):
                changed = append(ref.doctypes, doctype_map[m.group('name')]) or changed

            # A bare three-character code is wrapped in brackets so CODE_PATTERN can be
            # applied to it below.
            if len(jsondata.get('lgcode', '')) == 3:
                jsondata['lgcode'] = '[%s]' % jsondata['lgcode']

            for m in CODE_PATTERN.finditer(jsondata.get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print('--> unknown code: %s' % code.encode('utf8'))
                    else:
                        changed = append(ref.languages, languoid_map[code]) or changed

            for glottocode in filter(None, jsondata.get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print('--> unknown glottocode: %s' % glottocode.encode('utf8'))
                else:
                    changed = append(ref.languages, languoid_map[glottocode]) or changed

            if not update:
                DBSession.add(ref)

            if i % 100 == 0:
                print('%s records done' % i)

            if changed:
                count += 1

        print('%s records updated or imported' % count)
        print('%s records skipped because of lack of information' % skipped)