Example #1
 def create(self, req, filename=None, verbose=True, outfile=None):
     with safe_overwrite(outfile or self.abspath(req)) as tmp:
         if self.rdf:
             # we do not create archives with a readme for rdf downloads, because each
             # RDF entity points to the dataset and the void description of the dataset
             # covers all relevant metadata.
             #
             # TODO: write test for the file name things!?
             #
             with closing(GzipFile(
                 filename=Path(tmp.stem).stem, fileobj=tmp.open('wb')
             )) as fp:
                 self.before(req, fp)
                 for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                     self.dump(req, fp, item, i)
                 self.after(req, fp)
         else:
             with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                 if not filename:
                     fp = self.get_stream()
                     self.before(req, fp)
                     for i, item in enumerate(
                             page_query(self.query(req), verbose=verbose)):
                         self.dump(req, fp, item, i)
                     self.after(req, fp)
                     zipfile.writestr(self.name, self.read_stream(fp))
                 else:  # pragma: no cover
                     zipfile.write(Path(filename).as_posix(), self.name)
                 zipfile.writestr('README.txt', format_readme(req).encode('utf8'))
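
Note: every example in this listing iterates over query results via page_query. For orientation, here is a minimal sketch of what such a pagination helper can look like; it assumes a SQLAlchemy Query object, and the name page_query_sketch is illustrative, not the actual page_query used in these examples.

def page_query_sketch(q, n=1000, verbose=False, commit=False):
    """Yield the results of query q in pages of n rows to bound memory use (sketch only)."""
    offset = 0
    while True:
        page = q.offset(offset).limit(n).all()  # fetch one page of results
        if not page:
            break
        for item in page:
            yield item
        if commit:
            q.session.commit()  # assumption: flush pending changes between pages
        if verbose:
            print('... %s rows done' % (offset + len(page)))
        offset += n
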
Example #2
 def create(self, req, filename=None, verbose=True, outfile=None):
     with safe_overwrite(outfile or self.abspath(req)) as tmp:
         if self.rdf:
             # we do not create archives with a readme for rdf downloads, because each
             # RDF entity points to the dataset and the void description of the dataset
             # covers all relevant metadata.
             #
             # TODO: write test for the file name things!?
             #
             with contextlib.closing(gzip.GzipFile(
                 filename=pathlib.Path(tmp.stem).stem, fileobj=tmp.open('wb')
             )) as fp:
                 self.before(req, fp)
                 for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                     self.dump(req, fp, item, i)
                 self.after(req, fp)
         else:
             with zipfile.ZipFile(tmp.as_posix(), 'w', zipfile.ZIP_DEFLATED) as zipf:
                 if not filename:
                     fp = self.get_stream()
                     self.before(req, fp)
                     for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                         self.dump(req, fp, item, i)
                     self.after(req, fp)
                     zipf.writestr(self.name, self.read_stream(fp))
                 else:  # pragma: no cover
                     zipf.write(str(pathlib.Path(filename)), self.name)
                 zipf.writestr(
                     'README.txt',
                     format_readme(req, req.db.query(Dataset).first()).encode('utf8'))
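
Note: Examples #1 and #2 write through a safe_overwrite context manager, while the older variants below (Examples #3, #4, #7, #8) spell the same pattern out by hand: write to a temporary path, then move it over the target. A minimal sketch of such a context manager, assuming pathlib-style paths (illustrative, and not the actual safe_overwrite used above):

import shutil
from contextlib import contextmanager
from pathlib import Path

@contextmanager
def safe_overwrite_sketch(outfile):
    """Yield a temporary path next to outfile; move it into place only if the block succeeds."""
    outfile = Path(outfile)
    outfile.parent.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
    tmp = outfile.parent / (outfile.name + '.tmp')
    yield tmp
    # only reached if the with-block did not raise
    shutil.move(str(tmp), str(outfile))
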
Example #3
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.dirname().exists():
            p.dirname().mkdir()
        tmp = path('%s.tmp' % p)

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            with closing(GzipFile(tmp, 'w')) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp, 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = StringIO()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    fp.seek(0)
                    zipfile.writestr(self.name, fp.read())
                else:
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    """
{0} data download
{1}

Data of {0} is published under the following license:
{2}

It should be cited as

{3}
""".format(
                    req.dataset.name,
                    '=' * (len(req.dataset.name.encode('utf8'))
                           + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req).encode('utf8')))
        if p.exists():
            p.remove()
        tmp.move(p)
Example #4
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.dirname().exists():
            p.dirname().mkdir()
        tmp = path('%s.tmp' % p)

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            with closing(GzipFile(tmp, 'w')) as fp:
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp, 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = StringIO()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    fp.seek(0)
                    zipfile.writestr(self.name, fp.read())
                else:
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt', """
{0} data download
{1}

Data of {0} is published under the following license:
{2}

It should be cited as

{3}
""".format(
                        req.dataset.name,
                        '=' * (len(req.dataset.name.encode('utf8')) +
                               len(' data download')), req.dataset.license,
                        TxtCitation(None).render(req.dataset,
                                                 req).encode('utf8')))
        if p.exists():
            p.remove()
        tmp.move(p)
Example #5
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk), n=10000, verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {'path': as_posix(tmp), 'resources': count_rsc, 'triples': count_triples}
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath(
        'static', 'download', '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
Example #6
    def get_values(self, p, language_url_pattern):
        q = DBSession.query(Value).join(Value.valueset)\
            .filter(ValueSet.parameter_pk == p.pk)\
            .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(Value.valueset, ValueSet.references, ValueSetReference.source)
        ).order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

        with UnicodeWriter() as writer:
            writer.writerow([
                'ID',
                'Language_ID',
                'Parameter_ID',
                'Contribution_ID',
                'Value',
                'Source',
                'Comment',
            ])
            for v in page_query(q):
                writer.writerow([
                    v.id,
                    language_url_pattern.format(v.valueset.language.id),
                    p.id,
                    v.valueset.contribution.id,
                    v.domainelement.name if v.domainelement else v.name,
                    ';'.join(self.format_sources(v)),
                    getattr(v, 'comment', v.valueset.source) or '',
                ])

        return writer.read()
Example #7
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(GzipFile(
                    filename=Path(tmp.stem).stem, fileobj=tmp.open('wb')
            )) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (
                            len(req.dataset.name)
                            + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
Example #8
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(
                    GzipFile(filename=Path(tmp.stem).stem,
                             fileobj=tmp.open('wb'))) as fp:
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (len(req.dataset.name) + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset,
                                                 req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
Example #9
 def get_languages(self, req, language_url_pattern):
     q = DBSession.query(Language).filter(Language.active == true()).options(
         joinedload(Language.languageidentifier, LanguageIdentifier.identifier))
     for l in page_query(q):
         yield {
             '@id': language_url_pattern.format(l.id),
             'dc:title': l.name,
             'dc:identifier': [
                 {'@id': i.url(), 'schema:name': i.name}
                 for i in l.identifiers if i.url()],
         }
Example #10
def main(args):  # pragma: no cover
    # we merge information about extinct languages from unesco and Harald.
    extinct = dict(list(dsv.reader(args.data_file('extinct.tab'))))
    with transaction.manager:
        query = language_query().options(
            joinedload_all(Language.valuesets, ValueSet.values))
        # loop over active, established languages with geo-coords
        for l in page_query(query, n=100, verbose=True):
            # let's collect the relevant sources in a way that allows computation of med.
            # Note: we limit refs to the ones without computerized assignments.
            sources = DBSession.query(Ref).join(LanguageSource)\
                .filter(LanguageSource.language_pk == l.pk) \
                .filter(Ref.ca_doctype_trigger == None)\
                .filter(Ref.ca_language_trigger == None)\
                .options(joinedload(Ref.doctypes))
            sources = sorted(map(Source, sources))

            # keep the overall med
            # note: this source may not be included in the potential meds computed below,
            # e.g. because it may not have a year.
            med = sources[0].__json__() if sources else None

            # now we have to compute meds respecting a cut-off year.
            # to do so, we collect eligible sources per year and then
            # take the med of this collection.
            potential_meds = []

            # we only have to loop over publication years within all sources, because
            # only in these years something better might have come along.
            for year in set(s.year for s in sources if s.year):
                # let's see if something better was published!
                eligible = [s for s in sources if s.year and s.year <= year]
                if eligible:
                    potential_meds.append(sorted(eligible)[0])

            # we store the precomputed sources information as jsondata:
            l.update_jsondata(
                endangerment='Extinct' if l.hid in extinct else l.endangerment,
                med=med,
                sources=[s.__json__() for s in
                         sorted(set(potential_meds), key=lambda s: -s.year)])
Example #11
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk),
                                  n=10000,
                                  verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {
        'path': as_posix(tmp),
        'resources': count_rsc,
        'triples': count_triples
    }
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath('static', 'download',
                                        '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
Example #12
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
Example #13
File: util.py Project: clld/clld
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #14
def ia_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    else:
        if callable(sources):
            sources = sources()

    i = 0
    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('ia', 'source%s.json' % source.id)

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        continue
                if not data['response']['numFound']:
                    continue
                item = data['response']['docs'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = text_type(item.get('year', ''))
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['title'])
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if needs_check:
                log.info('------- %s -> %s' % (source.id, item['identifier']))
                log.info(item['title'])
                log.info(stitle)
                log.info(item.get('year'))
                log.info(source.year)
                log.info(item['creator'])
                log.info(source.author)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"response": {'numFound': 0}}, fp)
        elif command == 'update':
            source.update_jsondata(internetarchive_id=item['identifier'])
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = quote_plus(b'creator:"%s" AND title:"%s"' % (
                    source.author.split(',')[0].encode('utf8'), title.encode('utf8')))

                count += 1
                r = requests.get(API_URL + q, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, r.url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned internet archive identifiers for %s out of %s sources'
                 % (count, i))
    elif command == 'download':
        log.info('queried internet archive for %s sources' % count)
Example #15
def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(joinedload_all(
        Language.languageidentifier, LanguageIdentifier.identifier
    )):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn('brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)
Example #16
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['volumeInfo']['title'] + ' ' +
                           item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info(
                    '------- %s -> %s' %
                    (source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (item['volumeInfo']['title'],
                                    item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' +
                             quote_plus(source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #17
def ia_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    else:
        if callable(sources):
            sources = sources()

    i = 0
    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('ia', 'source%s.json' % source.id)

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        continue
                if not data['response']['numFound']:
                    continue
                item = data['response']['docs'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = text_type(item.get('year', ''))
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['title'])
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if needs_check:
                log.info('------- %s -> %s' % (source.id, item['identifier']))
                log.info(item['title'])
                log.info(stitle)
                log.info(item.get('year'))
                log.info(source.year)
                log.info(item['creator'])
                log.info(source.author)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"response": {'numFound': 0}}, fp)
        elif command == 'update':
            source.update_jsondata(internetarchive_id=item['identifier'])
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = quote_plus(b'creator:"%s" AND title:"%s"' %
                               (source.author.split(',')[0].encode('utf8'),
                                title.encode('utf8')))

                count += 1
                r = requests.get(API_URL + q,
                                 headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, r.url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info(
            'assigned internet archive identifiers for %s out of %s sources' %
            (count, i))
    elif command == 'download':
        log.info('queried internet archive for %s sources' % count)
Example #18
def update_reflang(args):
    with open(args.data_file('brugmann_noderefs.json')) as fp:
        brugmann_noderefs = json.load(fp)

    ignored, obsolete, changed, unknown = 0, 0, 0, {}
    languoid_map = {}
    for l in DBSession.query(Languoid).filter(Languoid.hid != None):
        languoid_map[l.hid] = l.pk

    lgcodes = {}
    for rec in Database.from_file(
            args.data_file(args.version, 'refs.bib'), encoding='utf8'):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    #for ref in DBSession.query(Ref).order_by(desc(Source.pk)).limit(10000):
    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            ignored += 1
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            obsolete += 1
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    #print 'relation added according to brugmann data'
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    print('brugmann relation for non-existing languoid')

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                unknown[code] = 1
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            changed += 1

    print(ignored, 'ignored')
    print(obsolete, 'obsolete')
    print(changed, 'changed')
    print('unknown codes', list(unknown.keys()))
Example #19
def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(
            joinedload_all(Language.languageidentifier,
                           LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(DBSession.query(Ref).order_by(desc(Source.pk)),
                          n=10000,
                          commit=True,
                          verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith(
                'Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active)
            and l.pk not in remove
        ]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn(
                        'brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)