def run(self):
    if self.isil != 'DE-15':
        raise RuntimeError('not implemented except for DE-15')
    cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
    adapter = requests.adapters.HTTPAdapter(max_retries=3)
    cache.sess.mount('http://', adapter)

    finc = self.config.get('ai', 'finc-solr')
    ai = self.config.get('ai', 'ai-solr')

    with self.input().open() as handle:
        with self.output().open('w') as output:
            for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                if row.status == 'NOT_FOUND':
                    # ISSN not found upstream: count documents in the finc SOLR index.
                    link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (finc, self.isil, row.issn)
                    self.logger.info('fetch #%05d: %s', i, link)
                    body = cache.get(link)
                    content = json.loads(body)
                    output.write_tsv('finc', row.issn, content['response']['numFound'], link)
                else:
                    # Otherwise, count documents in the article index (AI) SOLR.
                    link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (ai, self.isil, row.issn)
                    self.logger.info('fetch #%05d: %s', i, link)
                    body = cache.get(link)
                    content = json.loads(body)
                    output.write_tsv('ai', row.issn, content['response']['numFound'], link)
def run(self):
    cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
    adapter = requests.adapters.HTTPAdapter(max_retries=self.max_retries)
    cache.sess.mount('http://', adapter)

    filter = "from-{self.filter}-date:{self.begin},until-{self.filter}-date:{self.end}".format(self=self)
    rows, offset = self.rows, 0

    with self.output().open('w') as output:
        while True:
            params = {"rows": rows, "offset": offset, "filter": filter}
            url = 'http://api.crossref.org/works?%s' % (urllib.urlencode(params))
            for attempt in range(1, 3):
                body = cache.get(url)
                try:
                    content = json.loads(body)
                except ValueError as err:
                    if attempt == 2:
                        self.logger.debug("URL was %s" % url)
                        self.logger.debug(err)
                        self.logger.debug(body[:100])
                        raise
                    if os.path.exists(cache.get_cache_file(url)):
                        # A cached but unparsable response: drop the entry and retry.
                        self.logger.debug("trying to recover by removing cached entry")
                        os.remove(cache.get_cache_file(url))
                else:
                    break
            items = content["message"]["items"]
            self.logger.debug("%s: %s" % (url, len(items)))
            if len(items) == 0:
                break
            output.write(body + "\n")
            offset += rows
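# Illustrative sketch (not part of the original task code): the retry loop above
# recovers from truncated or otherwise unparsable cached responses by deleting the
# cache file and fetching again. The helper below isolates that pattern; the `cache`
# argument is assumed to expose `get(url)` and `get_cache_file(url)` like URLCache
# above, and `attempts` is a hypothetical parameter.
import json
import os


def fetch_json_with_cache_recovery(cache, url, attempts=2):
    """Return parsed JSON for url, evicting a broken cache entry before giving up."""
    for attempt in range(1, attempts + 1):
        body = cache.get(url)
        try:
            return json.loads(body)
        except ValueError:
            if attempt == attempts:
                raise
            cache_file = cache.get_cache_file(url)
            if os.path.exists(cache_file):
                os.remove(cache_file)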
def run(self):
    if self.isil != 'DE-15':
        raise RuntimeError('not implemented except for DE-15')
    cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
    adapter = requests.adapters.HTTPAdapter(max_retries=3)
    cache.sess.mount('http://', adapter)

    with self.input().open() as handle:
        with self.output().open('w') as output:
            for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                if row.status == 'NOT_FOUND':
                    link = 'https://katalog.ub.uni-leipzig.de/Search/Results?lookfor=%s&type=ISN' % row.issn
                    self.logger.info('fetch #%05d: %s', i, link)
                    body = cache.get(link)
                    if 'Keine Ergebnisse!' in body:
                        # "No results" marker on the page: ISSN is not in the catalog.
                        output.write_tsv(row.issn, 'ERR_NOT_IN_CATALOG', link)
                    else:
                        soup = BeautifulSoup(body)
                        rs = soup.findAll("div", {"class": "floatleft"})
                        if len(rs) == 0:
                            output.write_tsv(row.issn, 'ERR_LAYOUT', link)
                            continue
                        first = rs[0]
                        # Expect a result count like "Treffer1-10von123" in the div text.
                        match = re.search(r'Treffer([0-9]+)-([0-9]+)von([0-9]+)', first.text)
                        if match:
                            total = match.group(3)
                            output.write_tsv(row.issn, 'FOUND_RESULTS_%s' % total, link)
                        else:
                            output.write_tsv(row.issn, 'ERR_NO_MATCH', link)
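# Illustrative sketch: the regular expression above extracts the total hit count
# from the German result header ("Treffer ... von ..."). The sample strings below
# are hypothetical, chosen only to show what the pattern does and does not match.
import re

RESULT_COUNT = re.compile(r'Treffer([0-9]+)-([0-9]+)von([0-9]+)')

assert RESULT_COUNT.search('Treffer1-10von123').group(3) == '123'
assert RESULT_COUNT.search('Treffer 1 - 10 von 123') is None  # whitespace defeats the pattern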
def run(self):
    if self.isil != 'DE-15':
        raise RuntimeError('not implemented except for DE-15')
    cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
    adapter = requests.adapters.HTTPAdapter(max_retries=3)
    cache.sess.mount('http://', adapter)

    finc = self.config.get('ai', 'finc-solr')
    ai = self.config.get('ai', 'ai-solr')

    def numFound(link):
        """
        Given a SOLR query URL, return the number of docs found.
        """
        body = cache.get(link)
        content = json.loads(body)
        return content['response']['numFound']

    with self.input().open() as handle:
        with self.output().open('w') as output:
            for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                if row.status == 'NOT_FOUND':
                    link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (finc, self.isil, row.issn)
                    self.logger.info('fetch #%05d: %s', i, link)
                    output.write_tsv('finc', row.issn, numFound(link), link)
                else:
                    link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (ai, self.isil, row.issn)
                    self.logger.info('fetch #%05d: %s', i, link)
                    output.write_tsv('ai', row.issn, numFound(link), link)
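# Illustrative sketch: the select links above are built by plain string interpolation.
# An equivalent construction via urlencode (shown here with a hypothetical base URL and
# ISSN) escapes reserved characters; rows=0 is an addition not in the original links,
# so only the count is returned. The 'response' -> 'numFound' shape consumed by
# numFound() above is the standard SOLR JSON envelope.
from urllib.parse import urlencode


def solr_count_link(base, isil, issn):
    """Build a SOLR select URL that only returns the hit count."""
    params = {'q': 'institution:%s AND issn:%s' % (isil, issn), 'wt': 'json', 'rows': 0}
    return '%s/select?%s' % (base, urlencode(params))

# solr_count_link('http://localhost:8983/solr/biblio', 'DE-15', '1234-5678')
# -> .../select?q=institution%3ADE-15+AND+issn%3A1234-5678&wt=json&rows=0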
def run(self): """ > Using large offset values can result in extremely long response times. Offsets in the 100,000s and beyond will likely cause a timeout before the API is able to respond. An alternative to paging through very large result sets (like a corpus used for text and data mining) it to use the API's exposure of Solr's deep paging cursors. Any combination of query, filters and facets may be used with deep paging cursors. While rows may be specified along with cursor, offset and sample cannot be used. To use deep paging make a query as normal, but include the cursor parameter with a value of * (https://git.io/vFyAn). > But we prefer carrots to sticks. As of September 18th 2017 any API queries that use HTTPS and have appropriate contact information will be directed to a special pool of API machines that are reserved for polite users. (https://git.io/vFyN5), refs #9059. """ cache = URLCache( directory=os.path.join(tempfile.gettempdir(), '.urlcache')) adapter = requests.adapters.HTTPAdapter(max_retries=self.max_retries) cache.sess.mount('http://', adapter) filter = "from-{self.filter}-date:{self.begin},until-{self.filter}-date:{self.end}".format( self=self) rows, offset = self.rows, 0 cursor = '*' with self.output().open('w') as output: params = {'rows': rows, 'filter': filter} while True: params['cursor'] = cursor # Do not fail, if user has not configured mailto, https://git.io/vFyN5. if self.config.get('crossref', 'mailto', fallback=None): params['mailto'] = self.config.get('crossref', 'mailto') url = 'https://api.crossref.org/works?%s' % ( urllib.parse.urlencode(params)) for attempt in range(1, self.attempts): if not cache.is_cached(url): time.sleep(self.sleep) body = cache.get(url) try: content = json.loads(body) except ValueError as err: if attempt == self.attempts - 1: self.logger.debug('URL was %s', url) self.logger.debug(err) self.logger.debug(body[:100] + '...') raise cache_file = cache.get_cache_file(url) if os.path.exists(cache_file): self.logger.debug( 'trying to recover by removing cached entry at %s', cache_file) os.remove(cache_file) else: break count = len(content['message']['items']) self.logger.debug("%s: %s", url, count) if count == 0: break body = body + '\n' if isinstance(body, string_types): body = body.encode('utf-8') output.write(body) offset += rows if not 'next-cursor' in content['message']: raise RuntimeError('missing key: next-cursor') cursor = content['message']['next-cursor']
def run(self): """ > Using large offset values can result in extremely long response times. Offsets in the 100,000s and beyond will likely cause a timeout before the API is able to respond. An alternative to paging through very large result sets (like a corpus used for text and data mining) it to use the API's exposure of Solr's deep paging cursors. Any combination of query, filters and facets may be used with deep paging cursors. While rows may be specified along with cursor, offset and sample cannot be used. To use deep paging make a query as normal, but include the cursor parameter with a value of * (https://git.io/vFyAn). > But we prefer carrots to sticks. As of September 18th 2017 any API queries that use HTTPS and have appropriate contact information will be directed to a special pool of API machines that are reserved for polite users. (https://git.io/vFyN5), refs #9059. """ cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache')) adapter = requests.adapters.HTTPAdapter(max_retries=self.max_retries) cache.sess.mount('http://', adapter) filter = "from-{self.filter}-date:{self.begin},until-{self.filter}-date:{self.end}".format(self=self) rows, offset = self.rows, 0 cursor = '*' with self.output().open('w') as output: params = {'rows': rows, 'filter': filter} while True: params['cursor'] = cursor # Do not fail, if user has not configured mailto, https://git.io/vFyN5. if self.config.get('crossref', 'mailto', fallback=None): params['mailto'] = self.config.get('crossref', 'mailto') url = 'https://api.crossref.org/works?%s' % (urllib.parse.urlencode(params)) for attempt in range(1, self.attempts): if not cache.is_cached(url): time.sleep(self.sleep) body = cache.get(url) try: content = json.loads(body) except ValueError as err: if attempt == self.attempts - 1: self.logger.debug('URL was %s', url) self.logger.debug(err) self.logger.debug(body[:100] + '...') raise cache_file = cache.get_cache_file(url) if os.path.exists(cache_file): self.logger.debug('trying to recover by removing cached entry at %s', cache_file) os.remove(cache_file) else: break count = len(content['message']['items']) self.logger.debug("%s: %s", url, count) if count == 0: break output.write(body + '\n') offset += rows if not 'next-cursor' in content['message']: raise RuntimeError('missing key: next-cursor') cursor = content['message']['next-cursor']
def test_get_cache_file(tmpdir):
    cache = URLCache(directory=str(tmpdir))
    fn = cache.get_cache_file("http://x.com")
    assert fn.startswith(str(tmpdir))
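# Additional test sketch, assuming the cache file path is a pure function of the URL
# (the harvesting code above relies on this when it evicts a broken entry via
# get_cache_file before re-fetching).
def test_get_cache_file_is_deterministic(tmpdir):
    cache = URLCache(directory=str(tmpdir))
    first = cache.get_cache_file("http://x.com")
    second = cache.get_cache_file("http://x.com")
    assert first == second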