Code Example #1
File: util.py Project: kublaj/glottolog3
def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt')) as fp:
        content = fp.read().decode('latin1')

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]

        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'

        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
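A note on the expected input: `update_providers` infers everything from the layout of `provider.txt`. The sketch below replays just the parsing steps on invented data to show that layout; it is not a file from the project, and the `unescape` call and database writes are left out.

# Illustrative sketch (invented data, not project code): provider.txt records
# are separated by three blank lines; the first line is "<id>.<ext>: <ABBR>",
# the rest is a description whose first sentence becomes the provider name.
sample = (
    "exampleprov.txt: EXP\n"
    "Example Provider Catalogue. An invented description sentence.\n"
    "\n\n\n"
    "otherprov.txt: OTH\n"
    "Other Provider Listing. More invented text.\n"
)

for block in sample.replace('\r\n', '\n').split('\n\n\n\n'):
    lines = block.split('\n')
    id_, abbr = lines[0].strip().split(':')
    id_ = id_.split('.')[0]              # 'exampleprov.txt' -> 'exampleprov'
    description = '\n'.join(lines[1:])   # the real code also unescape()s this
    name = description.split('.')[0]
    print('%s | %s | %s' % (id_, abbr.strip(), name))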
Code Example #2
def split_families(fp):
    """generator for (node, leafs) pairs parsed from Harald's classification format.
    """
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of
        ancestors.
        """
        branch = [unescape(n.strip().replace('_', ' ')) for n in line.split(',')]
        name_map = {
            'Deaf Sign Language': 'Sign Languages',
            'Unclassifiable': 'Unclassified',
            'Artificial Language': 'Artificial Language',
            'Mixed Language': 'Mixed Language',
            'Pidgin': 'Pidgin',
            #'Unattested': 'Unattested',
        }
        if branch[0] in name_map:
            return (
                [name_map[branch[0]]],
                'established' if branch[0] != 'Unattested' else 'unattested',
                ', '.join(branch[1:]))

        if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
            comment = ''
            if branch[0] == 'Speech Register':
                status = 'established'
                comment = 'speech register'
            else:
                status = branch[0].lower()
            if branch[0] == 'Unattested' and len(branch) == 1:
                # unattested languages without classification should not be treated as
                # isolates!
                branch[0] = 'Unclassified'
            else:
                branch = branch[1:]
            if branch and branch[0] in ['Retired']:
                status += ' retired'
                branch = branch[1:]
            return branch, status, ''

        return branch, 'established', ''

    family = None
    for line in fp.read().split('\n'):
        if not line.strip():
            continue
        if line.startswith('  '):
            name, code = line.strip().split('[')
            code = code.split(']')[0].replace('\\', '').replace('"', '').replace("'", '')
            code = code.replace('NOCODE-', 'NOCODE_')
            assert code
            assert len(code) == 3 or NOCODE_PATTERN.match(code)
            family[1][code] = unescape(name.strip().replace('_', ' '))
        else:
            if family:
                yield family
            family = [normalized_branch(line), {}]
    yield family
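For orientation, here is a minimal usage sketch for `split_families` above: invented classification data in the format the parser implies (unindented, comma-separated family lines; leaf lines indented two spaces with a bracketed code), plus stand-ins for the module-level `unescape` and `NOCODE_PATTERN` helpers the example assumes are in scope. It assumes it is placed in the same module as the function above.

# Minimal usage sketch -- invented data and stand-in helpers, not project code.
import io
import re

unescape = lambda s: s                   # stand-in for the real helper
NOCODE_PATTERN = re.compile('NOCODE_')   # stand-in for the real pattern

classification = u"""\
Examplish, Core Examplish
  Examplish_Proper [xxa]
  Old_Examplish [NOCODE_OldExamplish]
Unattested
  Ghostish [xxb]
"""

for (branch, status, comment), leafs in split_families(io.StringIO(classification)):
    print('%s | %s | %s' % (' > '.join(branch), status, ', '.join(sorted(leafs))))
# prints:
#   Examplish > Core Examplish | established | NOCODE_OldExamplish, xxa
#   Unclassified | unattested | xxb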
Code Example #3
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of
        ancestors.
        """
        branch = [
            unescape(n.strip().replace('_', ' ')) for n in line.split(',')
        ]
        name_map = {
            'Deaf Sign Language': 'Sign Languages',
            'Unclassifiable': 'Unclassified',
            'Artificial Language': 'Artificial Language',
            'Mixed Language': 'Mixed Language',
            'Pidgin': 'Pidgin',
            #'Unattested': 'Unattested',
        }
        if branch[0] in name_map:
            return ([
                name_map[branch[0]]
            ], 'established' if branch[0] != 'Unattested' else 'unattested',
                    ', '.join(branch[1:]))

        if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
            comment = ''
            if branch[0] == 'Speech Register':
                status = 'established'
                comment = 'speech register'
            else:
                status = branch[0].lower()
            if branch[0] == 'Unattested' and len(branch) == 1:
                # unattested languages without classification should not be treated as
                # isolates!
                branch[0] = 'Unclassified'
            else:
                branch = branch[1:]
            if branch and branch[0] in ['Retired']:
                status += ' retired'
                branch = branch[1:]
            return branch, status, ''

        return branch, 'established', ''
Code Example #4
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of
        ancestors.
        """
        branch = [unescape(n.strip().replace('_', ' ')) for n in line.split(',')]
        name_map = {
            'Deaf Sign Language': 'Sign Languages',
            'Unclassifiable': 'Unclassified',
            'Artificial Language': 'Artificial Language',
            'Mixed Language': 'Mixed Language',
            'Pidgin': 'Pidgin',
            #'Unattested': 'Unattested',
        }
        if branch[0] in name_map:
            return (
                [name_map[branch[0]]],
                'established' if branch[0] != 'Unattested' else 'unattested',
                ', '.join(branch[1:]))

        comment = ''
        if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
            if branch[0] == 'Speech Register':
                status = 'established'
                comment = 'speech register'
            else:
                status = branch[0].lower()
            if branch[0] == 'Unattested' and len(branch) == 1:
                # unattested languages without classification should not be treated as
                # isolates!
                branch[0] = 'Unclassified'
            else:
                branch = branch[1:]
            if branch and branch[0] in ['Retired']:
                status += ' retired'
                branch = branch[1:]
            return branch, status, ''

        return branch, 'established', comment
Code Example #5
def normalized_branch(line):
    """parse a line specifying a language family as comma separated list of
    ancestors.
    """
    name_map = {
        'Unattested',  # keep top-level family as subfamily
        'Unclassifiable',  # keep top-level family as subfamily
        'Pidgin',  # keep top-level family as subfamily
        'Mixed Language',  # keep top-level family as subfamily
        'Artificial Language',  # keep top-level family as subfamily
        'Speech Register',  # keep top-level family as subfamily
        # FIXME: also 'Sign Language'?
        'Spurious',  # bookkeeping 'Preliminary'
    }
    branch = [
        unescape(n.strip().replace('_', ' ')) for n in line.split(',')
    ]
    if branch[0] not in name_map:
        return branch, 'established'
    family = branch.pop(0)
    subfamily = None
    retired = False
    if branch:
        # there's a second level!
        if family == 'Spurious':
            if branch[0] == 'Retired':
                retired = True
                branch.pop(0)
        else:
            subfamily = '%s (%s)' % (branch.pop(0), family)
    status = 'established'
    if family in ['Spurious', 'Unattested']:
        status = family.lower()
        if retired:
            status += ' retired'
    if family == 'Spurious':
        family = BOOKKEEPING
    return nfilter([family, subfamily]), status
Code Example #6
def normalized_branch(line):
    """parse a line specifying a language family as comma separated list of
    ancestors.
    """
    name_map = {
        "Unattested",  # keep top-level family as subfamily
        "Unclassifiable",  # keep top-level family as subfamily
        "Pidgin",  # keep top-level family as subfamily
        "Mixed Language",  # keep top-level family as subfamily
        "Artificial Language",  # keep top-level family as subfamily
        "Speech Register",  # keep top-level family as subfamily
        # FIXME: also 'Sign Language'?
        "Spurious",  # bookkeeping 'Preliminary'
    }
    branch = [unescape(n.strip().replace("_", " ")) for n in line.split(",")]
    if branch[0] not in name_map:
        return branch, "established"
    family = branch.pop(0)
    subfamily = None
    retired = False
    if branch:
        # there's a second level!
        if family == "Spurious":
            if branch[0] == "Retired":
                retired = True
                branch.pop(0)
        else:
            subfamily = "%s (%s)" % (branch.pop(0), family)
    status = "established"
    if family in ["Spurious", "Unattested"]:
        status = family.lower()
        if retired:
            status += " retired"
    if family == "Spurious":
        family = BOOKKEEPING
    return nfilter([family, subfamily]), status
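A hypothetical driver for the refactored `normalized_branch` in examples #5 and #6: it now returns a `(branch, status)` pair, folds the special top-level families into a `'Subfamily (Family)'` label, and maps `'Spurious'` to a bookkeeping pseudo-family. The stand-ins below for `unescape`, `nfilter` and `BOOKKEEPING` are assumptions, not the project's definitions; the function itself is taken as defined above, placed at module level.

# Stand-ins (assumed, not from the project) so the function above can be called:
unescape = lambda s: s                        # assumed no-op
nfilter = lambda seq: [x for x in seq if x]   # assumed clldutils-style filter
BOOKKEEPING = 'Bookkeeping'                   # assumed label for the pseudo-family

for line in ['Austronesian, Malayo-Polynesian',
             'Pidgin, English Based',
             'Spurious, Retired']:
    print(normalized_branch(line))
# With these stand-ins the three lines yield, in order:
#   (['Austronesian', 'Malayo-Polynesian'], 'established')
#   (['Pidgin', 'English Based (Pidgin)'], 'established')
#   (['Bookkeeping'], 'spurious retired')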
Code Example #7
File: import_refs.py Project: kublaj/glottolog3
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s",
                (_end, pk))

    return changes
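Both this version of `main` and the one in example #10 delegate page parsing to `compute_pages`, which is not shown on this page. From the call sites (and the older inline logic in examples #9 and #13) it evidently returns a `(startpage, endpage, numberofpages)` triple, any of which may be `None`. The sketch below is a simplified stand-in written for illustration only, not the glottolog3 implementation.

import re

# Simplified stand-in for compute_pages (illustration only): sums up "a-b"
# page ranges the way the inline logic of examples #9/#13 does, and treats a
# bare number as a start page; roman-numeral prefixes etc. are ignored here.
_RANGE = re.compile(r'(?P<start>\d+)\s*-+\s*(?P<end>\d+)')

def compute_pages_sketch(pages):
    start, end, number = None, None, None
    for m in _RANGE.finditer(pages):
        if start is None:
            start = int(m.group('start'))
        end = int(m.group('end'))
        number = (number or 0) + (end - int(m.group('start')) + 1)
    if start is None:
        try:
            start = int(pages.strip())
        except ValueError:
            pass
    return start, end, number

# compute_pages_sketch('33-45, 101-110') -> (33, 110, 23)
# compute_pages_sketch('250')            -> (250, None, None)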
Code Example #8
def split_families(fp):
    """generator for (node, leafs) pairs parsed from Harald's classification format.
    """
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of
        ancestors.
        """
        branch = [
            unescape(n.strip().replace('_', ' ')) for n in line.split(',')
        ]
        name_map = {
            'Deaf Sign Language': 'Sign Languages',
            'Unclassifiable': 'Unclassified',
            'Artificial Language': 'Artificial Language',
            'Mixed Language': 'Mixed Language',
            'Pidgin': 'Pidgin',
            #'Unattested': 'Unattested',
        }
        if branch[0] in name_map:
            return ([
                name_map[branch[0]]
            ], 'established' if branch[0] != 'Unattested' else 'unattested',
                    ', '.join(branch[1:]))

        if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
            comment = ''
            if branch[0] == 'Speech Register':
                status = 'established'
                comment = 'speech register'
            else:
                status = branch[0].lower()
            if branch[0] == 'Unattested' and len(branch) == 1:
                # unattested languages without classification should not be treated as
                # isolates!
                branch[0] = 'Unclassified'
            else:
                branch = branch[1:]
            if branch and branch[0] in ['Retired']:
                status += ' retired'
                branch = branch[1:]
            return branch, status, ''

        return branch, 'established', ''

    family = None
    for line in fp.read().split('\n'):
        if not line.strip():
            continue
        if line.startswith('  '):
            name, code = line.strip().split('[')
            code = code.split(']')[0].replace('\\',
                                              '').replace('"',
                                                          '').replace("'", '')
            code = code.replace('NOCODE-', 'NOCODE_')
            assert code
            assert len(code) == 3 or NOCODE_PATTERN.match(code)
            family[1][code] = unescape(name.strip().replace('_', ' '))
        else:
            if family:
                yield family
            family = [normalized_branch(line), {}]
    yield family
Code Example #9
def main(bib, mode):  # pragma: no cover
    count = 0
    skipped = 0

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                skipped += 1
                #print '---> skip', rec.id
                #print rec
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {
                    'bibtexkey': rec.id
                },
            }

            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [
                        s.strip() for s in kw['publisher'].split(':', 1)
                    ]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    start = None
                    number = None
                    match = None

                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)

                    if match:
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    #if k == 'title':
                    #    v = ref.title or ref.description
                    #else:
                    if 1:
                        v = getattr(ref, k)
                    if kw[k] != v:
                        #
                        # TODO!
                        #
                        setattr(ref, k, kw[k])
                        #if k not in ['jsondata', 'publisher']:
                        #    print k, ref.pk
                        #    print kw[k]
                        #    print v
                        #    print '--------------'
                        changed = True
                    if ref.title:
                        ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            def append(attr, obj):
                if obj and obj not in attr:
                    changed = True
                    #
                    # TODO!
                    #
                    attr.append(obj)

            for name in set(
                    filter(None, [
                        s.strip() for s in kw['jsondata'].get(
                            'macro_area', '').split(',')
                    ])):
                append(ref.macroareas, macroarea_map[name])

            for name in set(
                    filter(None, [
                        s.strip()
                        for s in kw['jsondata'].get('src', '').split(',')
                    ])):
                append(ref.providers, provider_map[slug(name)])

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype',
                                                                 '')):
                append(ref.doctypes, doctype_map[m.group('name')])

            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print '--> unknown code:', code.encode('utf8')
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(
                    None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'
Code Example #10
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

    args.log.info('%s' % stats)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
Code Example #11
def split_families(fp):
    """generator for (node, leafs) pairs parsed from Harald's classification format.
    """

    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of
        ancestors.
        """
        name_map = {
            "Unattested",  # keep top-level family as subfamily
            "Unclassifiable",  # keep top-level family as subfamily
            "Pidgin",  # keep top-level family as subfamily
            "Mixed Language",  # keep top-level family as subfamily
            "Artificial Language",  # keep top-level family as subfamily
            "Speech Register",  # keep top-level family as subfamily
            # FIXME: also 'Sign Language'?
            "Spurious",  # bookkeeping 'Preliminary'
        }
        branch = [unescape(n.strip().replace("_", " ")) for n in line.split(",")]
        if branch[0] not in name_map:
            return branch, "established"
        family = branch.pop(0)
        subfamily = None
        retired = False
        if branch:
            # there's a second level!
            if family == "Spurious":
                if branch[0] == "Retired":
                    retired = True
                    branch.pop(0)
            else:
                subfamily = "%s (%s)" % (branch.pop(0), family)
        status = "established"
        if family in ["Spurious", "Unattested"]:
            status = family.lower()
            if retired:
                status += " retired"
        if family == "Spurious":
            family = BOOKKEEPING
        return nfilter([family, subfamily]), status

    family = None
    for line in fp.read().split("\n"):
        if not line.strip():
            continue
        if line.strip().endswith("TODO"):
            print "ignoring:", line
            continue
        if line.startswith("  "):
            name, code = line.strip().split("[")
            code = code.split("]")[0].replace("\\", "").replace('"', "").replace("'", "")
            code = code.replace("NOCODE-", "NOCODE_")
            try:
                assert len(code) == 3 or NOCODE_PATTERN.match(code)
            except:
                raise ValueError(code)
            family[1][code] = unescape(name.strip().replace("_", " "))
        else:
            if family:
                yield family
            family = [normalized_branch(line), {}]
    yield family
Code Example #12
def split_families(fp):
    """generator for (node, leafs) pairs parsed from Harald's classification format.
    """
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of
        ancestors.
        """
        name_map = {
            'Unattested',  # keep top-level family as subfamily
            'Unclassifiable',  # keep top-level family as subfamily
            'Pidgin',  # keep top-level family as subfamily
            'Mixed Language',  # keep top-level family as subfamily
            'Artificial Language',  # keep top-level family as subfamily
            'Speech Register',  # keep top-level family as subfamily
            # FIXME: also 'Sign Language'?
            'Spurious',  # bookkeeping 'Preliminary'
        }
        branch = [
            unescape(n.strip().replace('_', ' ')) for n in line.split(',')
        ]
        if branch[0] not in name_map:
            return branch, 'established'
        family = branch.pop(0)
        subfamily = None
        retired = False
        if branch:
            # there's a second level!
            if family == 'Spurious':
                if branch[0] == 'Retired':
                    retired = True
                    branch.pop(0)
            else:
                subfamily = '%s (%s)' % (branch.pop(0), family)
        status = 'established'
        if family in ['Spurious', 'Unattested']:
            status = family.lower()
            if retired:
                status += ' retired'
        if family == 'Spurious':
            family = BOOKKEEPING
        return nfilter([family, subfamily]), status

    family = None
    for line in fp.read().split('\n'):
        if not line.strip():
            continue
        if line.strip().endswith('TODO'):
            print 'ignoring:', line
            continue
        if line.startswith('  '):
            name, code = line.strip().split('[')
            code = code.split(']')[0].replace('\\',
                                              '').replace('"',
                                                          '').replace("'", '')
            code = code.replace('NOCODE-', 'NOCODE_')
            try:
                assert len(code) == 3 or NOCODE_PATTERN.match(code)
            except:
                raise ValueError(code)
            family[1][code] = unescape(name.strip().replace('_', ' '))
        else:
            if family:
                yield family
            family = [normalized_branch(line), {}]
    yield family
Code Example #13
File: import_refs.py Project: mitcho/glottolog3
def main(bib, mode):  # pragma: no cover
    count = 0
    skipped = 0

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                skipped += 1
                #print '---> skip', rec.id
                #print rec
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    start = None
                    number = None
                    match = None

                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)

                    if match:
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    #if k == 'title':
                    #    v = ref.title or ref.description
                    #else:
                    if 1:
                        v = getattr(ref, k)
                    if kw[k] != v:
                        #
                        # TODO!
                        #
                        setattr(ref, k, kw[k])
                        #if k not in ['jsondata', 'publisher']:
                        #    print k, ref.pk
                        #    print kw[k]
                        #    print v
                        #    print '--------------'
                        changed = True
                    if ref.title:
                        ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            def append(attr, obj):
                if obj and obj not in attr:
                    changed = True
                    #
                    # TODO!
                    #
                    attr.append(obj)

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')])):
                append(ref.macroareas, macroarea_map[name])

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                append(ref.providers, provider_map[slug(name)])

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')):
                append(ref.doctypes, doctype_map[m.group('name')])

            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print '--> unknown code:', code.encode('utf8')
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'