def add_urls_to_index(self, stream, params, filename, length):
    upload_key = params.get('param.upid')
    if upload_key:
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    params['writer_cls'] = CDXJIndexer

    cdx_list = (super(WebRecRedisIndexer, self)
                .add_urls_to_index(stream, params, filename, length))

    # if replay key exists, add to it as well!
    coll_cdxj_key = res_template(self.coll_cdxj_key, params)
    if self.redis.exists(coll_cdxj_key):
        for cdx in cdx_list:
            if cdx:
                self.redis.zadd(coll_cdxj_key, 0, cdx)

    # epoch seconds; note datetime.utcnow().timestamp() would misread the
    # naive UTC value as local time, so use time.time() directly
    ts_sec = int(time.time())

    with redis_pipeline(self.redis) as pi:
        for key_templ in self.info_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            if cdx_list:
                pi.hset(key, 'updated_at', ts_sec)

                if key_templ == self.rec_info_key_templ:
                    pi.hset(key, 'recorded_at', ts_sec)

    self.stats.incr_record(params, length, cdx_list)

    return cdx_list

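# For context: res_template expands '{...}' placeholders in a key or URL
# template from the params dict. A minimal stand-in sketch (hypothetical,
# not the actual pywb implementation, which also resolves 'param.'-prefixed
# keys and missing-value defaults):
def _res_template_sketch(template, params, **extra):
    fields = dict(params)
    fields.update(extra)
    return template.format_map(fields)

# _res_template_sketch('c:{coll}:cdxj', {'coll': 'my-coll'}) -> 'c:my-coll:cdxj'
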
def add_warc_file(self, full_filename, params):
    base_filename = self._get_rel_or_base_name(full_filename, params)

    file_key = res_template(self.file_key_template, params)
    rec_key = res_template(self.rec_file_key_template, params)

    full_load_path = storagepaths.add_local_store_prefix(full_filename)

    self.redis.hset(file_key, base_filename, full_load_path)
    self.redis.sadd(rec_key, base_filename)

def _is_write_resp(self, resp, params):
    if not params['recording'].is_open():
        logger.debug('Record Writer: Writing skipped, recording not open for write')
        return False

    user_key = res_template(self.user_key, params)

    size, max_size = self.redis.hmget(user_key, ['size', 'max_size'])
    size = int(size or 0)
    max_size = int(max_size or 0)

    length = resp.length or resp.rec_headers.get_header('Content-Length')
    if length is None:
        self.ensure_digest(resp, block=True, payload=True)
        resp.length = resp.payload_length
        length = resp.length

    # Content-Length comes back as a string; normalize before comparing
    length = int(length)

    if size + length > max_size:
        logger.error('Record Writer: New Record for {0} exceeds max size, '
                     'not recording!'.format(params['url']))
        return False

    return True

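# Standalone sketch of the quota check above: read both hash fields in one
# round trip, then compare. Assumes a local Redis; 'u:demo:info' and the
# values are made up for illustration.
import redis

_r = redis.StrictRedis(decode_responses=True)
_r.hset('u:demo:info', 'size', 900)
_r.hset('u:demo:info', 'max_size', 1000)

_size, _max_size = _r.hmget('u:demo:info', ['size', 'max_size'])
if int(_size or 0) + 200 > int(_max_size or 0):
    print('quota exceeded, skip write')
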
def add_warc_file(self, full_filename, params):
    base_filename = self._get_rel_or_base_name(full_filename, params)
    file_key = res_template(self.file_key_template, params)
    full_load_path = self.full_warc_prefix + full_filename

    self.redis.hset(file_key, base_filename, full_load_path)

def load_index(self, params):
    # no fuzzy match for live resources
    if params.get('is_fuzzy'):
        raise NotFoundException(params['url'] + '*')

    cdx = CDXObject()
    cdx['urlkey'] = params.get('key').decode('utf-8')
    cdx['timestamp'] = timestamp_now()
    cdx['url'] = params['url']
    cdx['load_url'] = res_template(self.proxy_url, params)
    cdx['is_live'] = 'true'

    mime = params.get('content_type', '')

    if params.get('filter') and not mime:
        try:
            res = self.sesh.head(cdx['url'])
            if res.status_code != 405:
                cdx['status'] = str(res.status_code)

            content_type = res.headers.get('Content-Type')
            if content_type:
                mime = content_type.split(';')[0]

        except Exception:
            pass

    cdx['mime'] = mime

    return iter([cdx])

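# The HEAD-based sniff above can be exercised standalone; some servers
# answer 405 for HEAD, which is why that status is ignored. The URL here
# is illustrative only.
import requests

_res = requests.head('https://example.com/', allow_redirects=True)
if _res.status_code != 405:
    _mime = (_res.headers.get('Content-Type') or '').split(';')[0]
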
def _write_to_file(self, params, write_callback):
    full_dir = res_template(self.dir_template, params)
    dir_key = self.get_dir_key(params)

    result = self.fh_cache.get(dir_key)

    close_file = False
    new_size = start = 0

    if result:
        out, filename = result
        is_new = False
    else:
        filename = self.get_new_filename(full_dir, params)

        if not self.allow_new_file(filename, params):
            return False

        out = self._open_file(filename, params)
        is_new = True

    try:
        start = out.tell()

        write_callback(out, filename)

        out.flush()
        new_size = out.tell()

        out.seek(start)

        if self.dedup_index:
            self.dedup_index.add_urls_to_index(out, params, filename,
                                               new_size - start)

        return True

    except Exception:
        traceback.print_exc()
        close_file = True
        return False

    finally:
        # check for rollover
        if self.max_size and new_size > self.max_size:
            close_file = True

        if close_file:
            self._close_file(out)
            if not is_new:
                self.fh_cache.pop(dir_key, None)

        elif is_new:
            if os.name != 'nt':
                portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB)

            self.fh_cache[dir_key] = (out, filename)

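# Sketch of the non-blocking exclusive lock taken on newly opened files
# above: LOCK_EX | LOCK_NB fails immediately if another process already
# holds the file, instead of blocking. Filename is illustrative only.
import portalocker

with open('example.warc.gz', 'ab') as _fh:
    try:
        portalocker.lock(_fh, portalocker.LOCK_EX | portalocker.LOCK_NB)
        _fh.write(b'...')
    except portalocker.LockException:
        print('file is locked by another writer')
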
def _is_write_resp(self, resp, params):
    if not params['recording'].is_open():
        logger.debug('Record Writer: Writing skipped, recording not open for write')
        return False

    user_key = res_template(self.user_key, params)

    size, max_size = self.redis.hmget(user_key, ['size', 'max_size'])
    size = int(size or 0)
    max_size = int(max_size or 0)

    # hard-coded minimum max_size, fix in the future
    if max_size < 5000000000:
        max_size = 5000000000

    length = resp.length or resp.rec_headers.get_header('Content-Length')
    if length is None:
        self.ensure_digest(resp, block=True, payload=True)
        resp.length = resp.payload_length
        length = resp.length

    # Content-Length comes back as a string; normalize before comparing
    length = int(length)

    # compare size of record vs max_size (debug prints folded into logger)
    logger.debug('user_key={0} size={1} length={2} max_size={3}'
                 .format(user_key, size, length, max_size))

    if size + length > max_size:
        logger.error('Record Writer: New Record for {0} exceeds max size, '
                     'not recording!'.format(params['url']))
        return False

    return True

def add_urls_to_index(self, stream, params, filename, length):
    upload_key = params.get('param.upid')
    if upload_key:
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    params['writer_cls'] = CDXJIndexer

    cdx_list = (super(WebRecRedisIndexer, self)
                .add_urls_to_index(stream, params, filename, length))

    # if replay key exists, add to it as well!
    coll_cdxj_key = res_template(self.coll_cdxj_key, params)
    if self.redis.exists(coll_cdxj_key):
        for cdx in cdx_list:
            if cdx:
                self.redis.zadd(coll_cdxj_key, 0, cdx)

    ts = datetime.now().date().isoformat()
    ts_sec = str(int(time.time()))

    with redis_pipeline(self.redis) as pi:
        for key_templ in self.size_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            if key_templ == self.rec_info_key_templ and cdx_list:
                pi.hset(key, 'updated_at', ts_sec)

        # write size to usage hashes
        if 'param.user' in params:
            if params['param.user'].startswith(self.temp_prefix):
                key = self.temp_usage_key

                # rate limiting
                rate_limit_key = self.get_rate_limit_key(params)
                if rate_limit_key:
                    pi.incrby(rate_limit_key, length)
                    pi.expire(rate_limit_key, self.rate_limit_ttl)
            else:
                key = self.user_usage_key

            if key:
                pi.hincrby(key, ts, length)

    return cdx_list

def __init__(self, max_size, redis_url, params, name, timeout=30):
    redis_url = res_template(redis_url, params)

    super(RedisPendingCounterTempBuffer, self).__init__(max_size=max_size)
    self.redis, self.key = RedisIndexSource.parse_redis_url(redis_url)
    self.timeout = timeout

    self.redis.incrby(self.key, 1)
    self.redis.expire(self.key, self.timeout)

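# The pending-counter pattern above in isolation: incrby creates the key if
# it does not exist, and the expire keeps the counter from leaking if the
# owning process dies before cleanup. Key name is illustrative.
import redis

_r = redis.StrictRedis()
_r.incrby('pending:example', 1)
_r.expire('pending:example', 30)
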
def _iter_sources(self, params):
    the_dir = res_template(self.base_dir, params)
    the_dir = os.path.join(self.base_prefix, the_dir)
    try:
        sources = list(self._load_files(the_dir))
    except Exception:
        raise NotFoundException(the_dir)

    return sources

def _get_api_url(self, params):
    api_url = res_template(self.api_url, params)

    if 'closest' in params and self.closest_limit:
        api_url += '&limit=' + str(self.closest_limit)

    if 'matchType' in params:
        api_url += '&matchType=' + params.get('matchType')

    return api_url

def _set_load_url(self, cdx, params):
    source_coll = ''
    name = params.get('_name')
    if name:
        source_coll = params.get('param.' + name + '.src_coll', '')

    cdx[self.url_field] = res_template(self.replay_url,
                                       dict(url=cdx['url'],
                                            timestamp=cdx['timestamp'],
                                            src_coll=source_coll))

def allow_new_file(self, filename, params):
    key = res_template(self.info_key, params)

    # ensure recording exists before writing anything
    # if not, abort opening new warc file here
    if not self.redis.exists(key):
        print('Writing skipped, recording does not exist for ' + filename)
        return False

    return True

def load_index(self, params):
    cdx = CDXObject()
    cdx['urlkey'] = params.get('key').decode('utf-8')

    closest = params.get('closest')
    cdx['timestamp'] = closest if closest else timestamp_now()
    cdx['url'] = params['url']
    cdx['load_url'] = res_template(self.proxy_url, params)
    cdx['memento_url'] = cdx['load_url']
    return self._do_load(cdx, params)

def _is_write_req(self, req, params):
    if not req or not req.rec_headers or not self.skip_key_template:
        return False

    skip_key = res_template(self.skip_key_template, params)

    if self.redis.get(skip_key) == '1':
        print('SKIPPING REQ', params.get('url'))
        return False

    return True

def _is_write_req(self, req, params):
    if not req or not req.rec_headers or not self.skip_key_template:
        return False

    skip_key = res_template(self.skip_key_template, params)

    if self.redis.get(skip_key) == '1':
        logger.debug('Record Writer: Skipping Request for: ' + params.get('url'))
        return False

    return True

def get_new_filename(self, dir_, params):
    timestamp = timestamp20_now()

    randstr = base64.b32encode(os.urandom(5)).decode('utf-8')

    filename = dir_ + res_template(self.filename_template, params,
                                   hostname=self.hostname,
                                   timestamp=timestamp,
                                   random=randstr)

    return filename

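# The random suffix above: 5 random bytes (40 bits) encode to exactly
# 8 base32 characters with no '=' padding, which keeps filenames clean.
import base64
import os

_randstr = base64.b32encode(os.urandom(5)).decode('utf-8')  # e.g. 'Q7ZT3A2B'
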
def is_rec_open(self, params):
    open_key = res_template(self.open_rec_key, params)

    # update ttl for open recording key, if it exists
    # if not, abort opening new warc file here
    if not self.redis.expire(open_key, self.open_rec_ttl):
        # if expire fails, recording not open!
        logging.debug('Writing skipped, recording not open for write: ' + open_key)
        return False

    return True

def load_index(self, params):
    filename = res_template(self.filename_template, params)
    fh = self._do_open(filename)

    def do_iter():
        with fh:
            for obj in self._do_iter(fh, params):
                yield obj

    return do_iter()

def load_key_index(self, key_template, params):
    z_key = res_template(key_template, params)
    index_list = self.redis.zrangebylex(z_key,
                                        b'[' + params['key'],
                                        b'(' + params['end_key'])

    def do_load(index_list):
        for line in index_list:
            yield CDXObject(line)

    return do_load(index_list)

def load_key_index(self, key_template, params):
    z_key = res_template(key_template, params)
    index_list = self.redis.zrangebylex(z_key,
                                        b'[' + params['key'],
                                        b'(' + params['end_key'])

    def do_load(index_list):
        for line in index_list:
            if isinstance(line, str):
                line = line.encode('utf-8')
            yield CDXObject(line)

    return do_load(index_list)

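# The zrangebylex bounds above follow Redis lex-range syntax: '[' makes a
# bound inclusive, '(' exclusive, so [key .. end_key) scans every index
# line for one urlkey. A sketch with illustrative data (redis-py 2.x zadd
# signature, matching the code above; redis-py 3+ takes a mapping instead):
import redis

_r = redis.StrictRedis()
_r.zadd('demo:cdxj', 0, b'com,example)/ 20201017123456 {}')
_lines = _r.zrangebylex('demo:cdxj', b'[com,example)/', b'(com,example)/!')
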
def _iter_sources(self, params):
    redis_key_pattern = res_template(self.redis_key_template, params)

    if '*' not in redis_key_pattern:
        keys = [redis_key_pattern]
    else:
        keys = self.scan_keys(redis_key_pattern, params)

    for key in keys:
        res = self._get_source_for_key(key)
        if res:
            yield key, res

def _get_rel_or_base_name(self, filename, params):
    rel_path = res_template(self.rel_path_template, params)
    try:
        base_name = os.path.relpath(filename, rel_path)
        assert '..' not in base_name
    except Exception:
        base_name = None

    if not base_name:
        base_name = os.path.basename(filename)

    return base_name

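# What the relpath guard above protects against, on a POSIX path layout
# (paths are illustrative):
import os

os.path.relpath('/warcs/coll/rec/file.warc.gz', '/warcs/coll')  # -> 'rec/file.warc.gz'
os.path.relpath('/tmp/file.warc.gz', '/warcs/coll')  # -> '../../tmp/file.warc.gz'
# the '..' trips the assert, so the code falls back to basename('file.warc.gz')
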
def load_index(self, params):
    filename = res_template(self.filename_template, params)

    try:
        fh = open(filename, 'rb')
    except IOError:
        raise NotFoundException(filename)

    def do_load(fh):
        with fh:
            gen = iter_range(fh, params['key'], params['end_key'])
            for line in gen:
                yield CDXObject(line)

    return do_load(fh)

def _is_write_resp(self, resp, params):
    if not self.is_rec_open(params):
        return False

    user_key = res_template(self.user_key, params)

    size, max_size = self.redis.hmget(user_key, ['size', 'max_size'])
    size = int(size or 0)
    max_size = int(max_size or 0)

    length = int(resp.length or
                 resp.rec_headers.get_header('Content-Length') or 0)

    if size + length > max_size:
        print('New Record for {0} exceeds max size, not recording!'
              .format(params['url']))
        return False

    return True

def handle_timemap(self, params):
    url = res_template(self.timemap_url, params)
    headers = self._get_headers(params)

    try:
        res = self.sesh.get(url,
                            headers=headers,
                            timeout=params.get('_timeout'))
        res.raise_for_status()
        assert res.text
    except Exception as e:
        self.logger.debug('FAILED: ' + str(e))
        raise NotFoundException(url)

    links = res.text
    return self.links_to_cdxobject(links, 'timemap')

def add_urls_to_index(self, stream, params, filename, length):
    base_filename = self._get_rel_or_base_name(filename, params)

    cdxout = BytesIO()
    write_cdx_index(cdxout, stream, base_filename,
                    cdxj=True, append_post=True,
                    writer_cls=params.get('writer_cls'))

    z_key = res_template(self.redis_key_template, params)

    cdx_list = cdxout.getvalue().rstrip().split(b'\n')

    for cdx in cdx_list:
        if cdx:
            self.redis.zadd(z_key, 0, cdx)

    return cdx_list

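# Every member is added with score 0, so the sorted set orders purely
# lexicographically: by urlkey, then timestamp. A CDXJ line as produced
# above looks like (illustrative values):
#
#   com,example)/ 20201017123456 {"url": "http://example.com/",
#   "mime": "text/html", "status": "200", "digest": "sha1:...",
#   "offset": "0", "length": "1043", "filename": "rec.warc.gz"}
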
def get_timegate_links(self, params, timestamp):
    url = res_template(self.timegate_url, params)
    accept_dt = timestamp_to_http_date(timestamp)
    try:
        headers = self._get_headers(params)
        headers['Accept-Datetime'] = accept_dt
        res = self.sesh.head(url, headers=headers)
        res.raise_for_status()
    except Exception as e:
        self.logger.debug('FAILED: ' + str(e))
        raise NotFoundException(url)

    links = res.headers.get('Link')
    if not links:
        raise NotFoundException(url)

    return links

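# Memento timegate negotiation in isolation: send Accept-Datetime on a
# HEAD request and read the Link header back. The timegate URL below is an
# illustrative public endpoint, not one taken from this code.
import requests

_headers = {'Accept-Datetime': 'Sat, 17 Oct 2020 12:34:56 GMT'}
_res = requests.head('http://web.archive.org/web/https://example.com/',
                     headers=_headers, allow_redirects=False)
print(_res.headers.get('Link'))
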
def scan_keys(self, match_templ, params, member_key=None):
    if not member_key:
        member_key = self.member_key_template

    if not member_key:
        return self.redis.scan_iter(match=match_templ)

    key = res_template(member_key, params)

    scan_key = 'scan:' + key

    # check if already have keys to avoid extra redis call
    keys = params.get(scan_key)
    if not keys:
        keys = self._load_key_set(key)
        params[scan_key] = keys

    #match_templ = match_templ.encode('utf-8')
    return [match_templ.replace('*', key) for key in keys]

def get_dir_key(self, params):
    return res_template(self.key_template, params)

def _get_api_url(self, params):
    api_url = res_template(self.api_url, params)

    if 'closest' in params and self.closest_limit:
        api_url += '&limit=' + str(self.closest_limit)

    return api_url