def create(self, req, filename=None, verbose=True, outfile=None):
    with safe_overwrite(outfile or self.abspath(req)) as tmp:
        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(GzipFile(
                    filename=Path(tmp.stem).stem, fileobj=tmp.open('wb'))) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(Path(filename).as_posix(), self.name)
                zipfile.writestr('README.txt', format_readme(req).encode('utf8'))

def create(self, req, filename=None, verbose=True, outfile=None):
    with safe_overwrite(outfile or self.abspath(req)) as tmp:
        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with contextlib.closing(gzip.GzipFile(
                    filename=pathlib.Path(tmp.stem).stem, fileobj=tmp.open('wb'))) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with zipfile.ZipFile(tmp.as_posix(), 'w', zipfile.ZIP_DEFLATED) as zipf:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipf.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipf.write(str(pathlib.Path(filename)), self.name)
                zipf.writestr(
                    'README.txt',
                    format_readme(req, req.db.query(Dataset).first()).encode('utf8'))

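# The two create() variants above delegate temp-file handling to a
# safe_overwrite() context manager, while the older variants below manage a
# ".tmp" sibling path and the final move by hand. As a rough, hypothetical
# sketch (an assumption about what such a helper could look like, not the
# library's actual implementation), safe_overwrite() might be written as:

import contextlib
import pathlib
import shutil
import tempfile


@contextlib.contextmanager
def safe_overwrite(path):
    """Yield a temporary sibling path; move it over `path` only on success."""
    path = pathlib.Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # name the temp file like the target, so e.g. tmp.stem behaves as above
    tmp = pathlib.Path(tempfile.mkdtemp(dir=str(path.parent))) / path.name
    try:
        yield tmp  # callers write to tmp themselves, e.g. via tmp.open('wb')
        if tmp.exists():
            shutil.move(str(tmp), str(path))  # replace the target only on success
    finally:
        shutil.rmtree(str(tmp.parent), ignore_errors=True)
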
def create(self, req, filename=None, verbose=True):
    p = self.abspath(req)
    if not p.dirname().exists():
        p.dirname().mkdir()
    tmp = path('%s.tmp' % p)

    if self.rdf:
        # we do not create archives with a readme for rdf downloads, because each
        # RDF entity points to the dataset and the void description of the dataset
        # covers all relevant metadata.
        with closing(GzipFile(tmp, 'w')) as fp:
            self.before(req, fp)
            for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                self.dump(req, fp, item, i)
            self.after(req, fp)
    else:
        with ZipFile(tmp, 'w', ZIP_DEFLATED) as zipfile:
            if not filename:
                fp = StringIO()
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
                fp.seek(0)
                zipfile.writestr(self.name, fp.read())
            else:
                zipfile.write(filename, self.name)
            zipfile.writestr(
                'README.txt',
                """
{0} data download
{1}

Data of {0} is published under the following license:
{2}

It should be cited as

{3}
""".format(
                    req.dataset.name,
                    '=' * (len(req.dataset.name.encode('utf8')) + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req).encode('utf8')))
    if p.exists():
        p.remove()
    tmp.move(p)

def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk), n=10000, verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {'path': as_posix(tmp), 'resources': count_rsc, 'triples': count_triples}
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath(
        'static', 'download', '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))

def get_values(self, p, language_url_pattern):
    q = DBSession.query(Value).join(Value.valueset)\
        .filter(ValueSet.parameter_pk == p.pk)\
        .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(Value.valueset, ValueSet.references, ValueSetReference.source)
        ).order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

    with UnicodeWriter() as writer:
        writer.writerow([
            'ID',
            'Language_ID',
            'Parameter_ID',
            'Contribution_ID',
            'Value',
            'Source',
            'Comment',
        ])
        for v in page_query(q):
            writer.writerow([
                v.id,
                language_url_pattern.format(v.valueset.language.id),
                p.id,
                v.valueset.contribution.id,
                v.domainelement.name if v.domainelement else v.name,
                ';'.join(self.format_sources(v)),
                getattr(v, 'comment', v.valueset.source) or '',
            ])

    return writer.read()

def create(self, req, filename=None, verbose=True):
    p = self.abspath(req)
    if not p.parent.exists():  # pragma: no cover
        p.parent.mkdir()
    tmp = Path('%s.tmp' % p.as_posix())

    if self.rdf:
        # we do not create archives with a readme for rdf downloads, because each
        # RDF entity points to the dataset and the void description of the dataset
        # covers all relevant metadata.
        #
        # TODO: write test for the file name things!?
        #
        with closing(GzipFile(
                filename=Path(tmp.stem).stem, fileobj=tmp.open('wb'))) as fp:
            self.before(req, fp)
            for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                self.dump(req, fp, item, i)
            self.after(req, fp)
    else:
        with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
            if not filename:
                fp = self.get_stream()
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
                zipfile.writestr(self.name, self.read_stream(fp))
            else:  # pragma: no cover
                zipfile.write(filename, self.name)
            zipfile.writestr(
                'README.txt',
                README.format(
                    req.dataset.name,
                    '=' * (len(req.dataset.name) + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req)).encode('utf8'))
    if p.exists():  # pragma: no cover
        remove(p)
    move(tmp, p)

def get_languages(self, req, language_url_pattern):
    q = DBSession.query(Language).filter(Language.active == true()).options(
        joinedload(Language.languageidentifier, LanguageIdentifier.identifier))
    for l in page_query(q):
        yield {
            '@id': language_url_pattern.format(l.id),
            'dc:title': l.name,
            'dc:identifier': [
                {'@id': i.url(), 'schema:name': i.name}
                for i in l.identifiers if i.url()],
        }

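# For illustration, one record yielded by get_languages() above would look
# roughly like this (the identifier URLs and names here are invented):
example_language_record = {
    '@id': 'http://example.org/languages/abcd1234',
    'dc:title': 'Example Language',
    'dc:identifier': [
        {'@id': 'http://www.ethnologue.com/language/abc', 'schema:name': 'abc'},
    ],
}
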
def main(args):  # pragma: no cover
    # we merge information about extinct languages from unesco and Harald.
    extinct = dict(list(dsv.reader(args.data_file('extinct.tab'))))

    with transaction.manager:
        query = language_query().options(
            joinedload_all(Language.valuesets, ValueSet.values))
        # loop over active, established languages with geo-coords
        for l in page_query(query, n=100, verbose=True):
            # let's collect the relevant sources in a way that allows computation of med.
            # Note: we limit refs to the ones without computerized assignments.
            sources = DBSession.query(Ref).join(LanguageSource)\
                .filter(LanguageSource.language_pk == l.pk)\
                .filter(Ref.ca_doctype_trigger == None)\
                .filter(Ref.ca_language_trigger == None)\
                .options(joinedload(Ref.doctypes))
            sources = sorted(map(Source, sources))

            # keep the overall med
            # note: this source may not be included in the potential meds computed below,
            # e.g. because it may not have a year.
            med = sources[0].__json__() if sources else None

            # now we have to compute meds respecting a cut-off year.
            # to do so, we collect eligible sources per year and then
            # take the med of this collection.
            potential_meds = []

            # we only have to loop over publication years within all sources, because
            # only in these years something better might have come along.
            for year in set(s.year for s in sources if s.year):
                # let's see if something better was published!
                eligible = [s for s in sources if s.year and s.year <= year]
                if eligible:
                    potential_meds.append(sorted(eligible)[0])

            # we store the precomputed sources information as jsondata:
            l.update_jsondata(
                endangerment='Extinct' if l.hid in extinct else l.endangerment,
                med=med,
                sources=[
                    s.__json__() for s in
                    sorted(set(potential_meds), key=lambda s: -s.year)])

def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (
                    source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (
                    item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)

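# For illustration, the "download" branch above assembles a query URL like the
# following for a hypothetical source (author, title, publisher and the API
# key are invented placeholders):
from urllib.parse import quote_plus

q = [
    "inauthor:" + quote_plus("Boas, Franz"),
    "intitle:" + quote_plus("Handbook of American Indian languages"),
    "inpublisher:" + quote_plus("Government Printing Office"),
]
url = "https://www.googleapis.com/books/v1/volumes?" + "q=%s&key=%s" % ("+".join(q), "<API_KEY>")
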
def ia_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    else:
        if callable(sources):
            sources = sources()

    i = 0
    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('ia', 'source%s.json' % source.id)

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        continue
                if not data['response']['numFound']:
                    continue
                item = data['response']['docs'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = text_type(item.get('year', ''))
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['title'])
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords)) \
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if needs_check:
                log.info('------- %s -> %s' % (source.id, item['identifier']))
                log.info(item['title'])
                log.info(stitle)
                log.info(item.get('year'))
                log.info(source.year)
                log.info(item['creator'])
                log.info(source.author)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"response": {'numFound': 0}}, fp)
        elif command == 'update':
            source.update_jsondata(internetarchive_id=item['identifier'])
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = quote_plus(b'creator:"%s" AND title:"%s"' % (
                    source.author.split(',')[0].encode('utf8'), title.encode('utf8')))
                count += 1
                r = requests.get(API_URL + q, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, r.url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == 'update':
        log.info(
            'assigned internet archive identifiers for %s out of %s sources'
            % (count, i))
    elif command == 'download':
        log.info('queried internet archive for %s sources' % count)

def update_reflang(args):
    stats = Counter()
    brugmann_noderefs = jsonload(args.data_dir.joinpath('languoid_refs.json'))

    languoid_map = {}
    for l in DBSession.query(Languoid).options(joinedload_all(
            Language.languageidentifier, LanguageIdentifier.identifier)):
        if l.hid:
            languoid_map[l.hid] = l.pk
        elif l.iso_code:
            languoid_map[l.iso_code] = l.pk
        languoid_map[l.id] = l.pk

    lgcodes = {}
    for rec in get_bib(args):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000, commit=True, verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            stats.update(['ignored'])
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            stats.update(['obsolete'])
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        # FIXME: adapt this for bib-entries now referring to glottocodes of
        #        families/dialects (e.g. add a sticky-bit to languagesource)
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    args.log.warn('brugmann relation for non-existing languoid %s' % lpk)

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                stats.update([code])
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            stats.update(['changed'])

    args.log.info('%s' % stats)

def update_reflang(args):
    with open(args.data_file('brugmann_noderefs.json')) as fp:
        brugmann_noderefs = json.load(fp)

    ignored, obsolete, changed, unknown = 0, 0, 0, {}

    languoid_map = {}
    for l in DBSession.query(Languoid).filter(Languoid.hid != None):
        languoid_map[l.hid] = l.pk

    lgcodes = {}
    for rec in Database.from_file(
            args.data_file(args.version, 'refs.bib'), encoding='utf8'):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    # for ref in DBSession.query(Ref).order_by(desc(Source.pk)).limit(10000):
    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000, commit=True, verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            ignored += 1
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            obsolete += 1
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        langs = [
            l for l in ref.languages
            if (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    # print('relation added according to brugmann data')
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    print('brugmann relation for non-existing languoid')

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                unknown[code] = 1
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print(ref.name, ref.id, '--', l.name, l.id)
                print('relation removed according to brugmann data')
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            changed += 1

    print(ignored, 'ignored')
    print(obsolete, 'obsolete')
    print(changed, 'changed')
    print('unknown codes', unknown.keys())