Ejemplo n.º 1
0
def main(args):
    client = Client(args.host, 'apics_data', args.user, args.password)

    for layout, table in [
        ('lect_description_references', 'Lect_description_references'),
        ('lect_descriptions', 'Lect_descriptions'),
        ('colours', 'Colours'),
        ('contributors', 'Contributors'),
        ("data (editors' layout)", 'Data'),
        ("data (apics-wals)", 'wals'),
        ("data references", "Data_references"),
        ("editors", "Editors"),
        ("examples", "ExamplesB"),
        ("examples (editors' layout)", "Examples"),
        ("feature references", "Feature_references"),
        ("features (publication)", "Featuresp"),
        ("features (value names)", "Featuresv"),
        ("features", "Features"),
        ("language references", "Language_references"),
        ("languages (editors' layout)", "Languages"),
        ("people", "People"),
        ("references", "References"),
        ("segment data (editors' layout)", "Segment_data"),
        ("segment data", "Segment_dataB"),
        ('segment features', 'Segment_features'),
        ("sociolinguistic data", "Sociolinguistic_data"),
        ("sociolinguistic data references", "Sociolinguistic_data_references"),
        ("sociolinguistic features", "Sociolinguistic_features"),
        ("value examples", "Value_examples"),
    ]:
        jsondump(client.get(layout), args.data_file('fm', '%s.json' % table))
Ejemplo n.º 2
0
    def __call__(self, outdir):
        """
        runs a parser workflow consisting of
        - preprocess
        - refactor
        - postprocess
        writes the results, an html, a css and a json file to disk.
        """
        cssutils_logger = logging.getLogger('CSSUTILS')
        cssutils_logger.setLevel(logging.ERROR)
        print(self.fname.namebase.encode('utf8'))

        with open(self.fname, encoding='utf8') as fp:
            c = fp.read()
        soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

        # extract css from the head section of the HTML doc:
        css = cssutils.parseString('\n')
        for style in soup.find('head').find_all('style'):
            for rule in self.cssrules(style):
                css.add(rule)

        md = dict(outline=[], refs=[], authors=[])
        soup = self.refactor(soup, md)

        # enhance section headings:
        for section, t in tag_and_text(soup.find_all('h3')):
            t = t.split('[Note')[0]
            id_ = 'section-%s' % slug(t)
            md['outline'].append((t, id_))
            section.attrs['id'] = id_
            for s, attrs in [
                (u'\u21eb', {'href': '#top', 'title': 'go to top of the page', 'style': 'vertical-align: bottom'}),
                ('¶', {'class': 'headerlink', 'href': '#' + id_, 'title': 'Permalink to this section'}),
            ]:
                append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

        body = self.insert_links(unicode(soup.find('body')), md)

        # write output files:
        with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
            fp.write(self.wrap(self.postprocess(body)))

        with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
            fp.write(self.csstext(css))

        md['authors'] = list(self.yield_valid_authors(md['authors']))
        jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
Ejemplo n.º 3
0
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

    args.log.info('%s' % stats)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
Ejemplo n.º 4
0
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias("l"), Languoid.__table__.alias("ll")
    gl_languoids = list(DBSession.execute(select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {row["ll_hid"]: row["l_pk"] for row in gl_languoids if row["ll_hid"]}
    max_languoid_pk = max(*[row["l_pk"] for row in gl_languoids])
    new_glottocodes = {}
    pk_to_name = {row["l_pk"]: row["l_name"] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath("languoids", "lff.txt"), hh_families, hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_dir.joinpath("languoids", "lof.txt"), hh_families, hh_languages)

    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn("new code {1} for existing name {0}".format(name, code))
            changes.append(
                languoid(
                    max_languoid_pk,
                    "language",
                    hid=code,
                    id=glottocode(unicode(name), DBSession, new_glottocodes),
                    name=name,
                    hname=name,
                    status=status,
                )
            )
            stats.update(["new_languages"])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info("Family with only new languages: %s, %s" % (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith("Unclassified")]:
                # ... or the full leafset contains new languages
                assert [hid for hid in hh_families[family[:-1]].keys() if hid in new_hid_to_pk]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))]

    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
        return [
            r["l_hid"]
            for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(
                        l.c.pk == tc.c.child_pk,
                        l.c.hid != None,
                        l.c.status != LanguoidStatus.provisional,
                        tc.c.parent_pk == pk,
                    )
                )
            )
        ]

    for row in gl_languoids:
        if row["ll_level"] == LanguoidLevel.family and row["l_active"]:
            leafs = get_leafset(select_leafs(row["l_pk"]))
            assert leafs
            glnode = GLNode(
                row["l_pk"], row["l_name"], row["ll_level"].name, row["ll_father_pk"], row["l_jsondata"].get("hname")
            )
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch, leafsets, fname_to_branches)
        )

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info("#### type1")
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info("#### type2")
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith("Unclassified")]:
                    assert [hid for hid in hh_families[hnode[:-1]].keys() if hid in new_hid_to_pk]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                "family",
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs["father_pk"]
            stats.update(["new"])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, "family", name=pk_to_name[m.pk])
        if m.hid:
            stats.update(["matches"])
            if len(m.hid) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, "rename", False):
                attrs["name"] = m.hid[-1]
            attrs["hname"] = m.hid[-1]
        else:
            attrs["active"] = False  # mark the languoid as obsolete.
            if getattr(m, "pointer", False):
                print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", ", ".join(m.pointer).encode("utf8")
                stats.update(["migrations"])
                attrs["replacement"] = branch_to_pk[m.pointer]
            else:
                stats.update(["nomatches"])
        changes.append(attrs)

    args.log.info("%s" % stats)

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, "language", status=status)
        else:
            attrs = languoid(id_, "language", status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs["active"]
        if id_ in pk_to_name and name != pk_to_name[id_]:
            if slug(pk_to_name[id_]) == slug(name):
                attrs["name"] = name
        if hnode:
            attrs["father_pk"] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs["hname"] = risolate_names[l]
        if l in rcollapsed_names:
            attrs["hname"] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row["ll_hid"]
        if hid and "NOCODE" in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(languoid(row["l_pk"], "language", status="retired", active=False, father_pk=None))

    jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)