def cdx_sort_closest(closest, cdx_iter, limit=10):
    """
    Sort cdx captures by proximity to the `closest` timestamp,
    yielding at most `limit` results, nearest first.

    :param closest: 14-digit target timestamp string
    :param cdx_iter: iterable of cdx records (assumed in ascending
                     timestamp order)
    :param limit: max number of results to keep
    """
    closest_cdx = []
    # Parallel sorted list of distance keys. Keeping keys separate
    # (instead of insort-ing (key, cdx) tuples) avoids ever comparing
    # two cdx objects when keys are equal.
    closest_keys = []
    closest_sec = timestamp_to_sec(closest)

    for cdx in cdx_iter:
        sec = timestamp_to_sec(cdx['timestamp'])
        key = abs(closest_sec - sec)

        i = bisect.bisect_right(closest_keys, key)
        closest_keys.insert(i, key)
        closest_cdx.insert(i, cdx)

        if len(closest_cdx) == limit:
            # assuming cdx in ascending order and keys have started increasing
            # fix: compare against the last *key*, not the (key, cdx) tuple
            # (the original int-vs-tuple compare raises on py3)
            if key > closest_keys[-1]:
                break

        if len(closest_cdx) > limit:
            # evict the current worst from both parallel lists
            closest_keys.pop()
            closest_cdx.pop()

    # yield directly; itertools.imap does not exist on py3
    for cdx in closest_cdx:
        yield cdx
def cdx_sort_closest(closest, cdx_iter, limit=10):
    """
    Sort cdx captures by proximity to the `closest` timestamp,
    yielding at most `limit` results, nearest first.

    :param closest: 14-digit target timestamp string
    :param cdx_iter: iterable of cdx records (assumed in ascending
                     timestamp order)
    :param limit: max number of results to keep
    """
    closest_cdx = []
    # parallel sorted list of distance keys, kept separate so the cdx
    # objects themselves are never compared
    closest_keys = []
    closest_sec = timestamp_to_sec(closest)

    for cdx in cdx_iter:
        sec = timestamp_to_sec(cdx[TIMESTAMP])
        key = abs(closest_sec - sec)

        i = bisect.bisect_right(closest_keys, key)
        closest_keys.insert(i, key)
        closest_cdx.insert(i, cdx)

        if len(closest_cdx) == limit:
            # assuming cdx in ascending order and keys have started increasing
            if key > closest_keys[-1]:
                break

        if len(closest_cdx) > limit:
            # fix: pop from BOTH parallel lists -- previously only
            # closest_cdx was popped, desyncing the lists: bisect indices
            # were computed on the longer key list, and closest_keys[-1]
            # stayed at the historical max, causing spurious breaks
            closest_keys.pop()
            closest_cdx.pop()

    for cdx in closest_cdx:
        yield cdx
def cdx_sort_closest(closest, cdx_iter, limit=10):
    """
    Sort cdx captures by proximity to the `closest` timestamp,
    yielding at most `limit` results, nearest first.

    :param closest: 14-digit target timestamp string
    :param cdx_iter: iterable of cdx records (assumed in ascending
                     timestamp order)
    :param limit: max number of results to keep
    """
    closest_cdx = []
    # Parallel sorted list of distance keys. Keeping keys separate
    # (instead of insort-ing (key, cdx) tuples) avoids ever comparing
    # two cdx objects when keys are equal.
    closest_keys = []
    closest_sec = timestamp_to_sec(closest)

    for cdx in cdx_iter:
        sec = timestamp_to_sec(cdx[TIMESTAMP])
        key = abs(closest_sec - sec)

        i = bisect.bisect_right(closest_keys, key)
        closest_keys.insert(i, key)
        closest_cdx.insert(i, cdx)

        if len(closest_cdx) == limit:
            # assuming cdx in ascending order and keys have started increasing
            # fix: compare against the last *key*, not the (key, cdx) tuple
            # (the original int-vs-tuple compare raises on py3)
            if key > closest_keys[-1]:
                break

        if len(closest_cdx) > limit:
            # evict the current worst from both parallel lists
            closest_keys.pop()
            closest_cdx.pop()

    # yield directly; itertools.imap does not exist on py3
    for cdx in closest_cdx:
        yield cdx
def cdx_index(self, z_key, stream, filename):
    """
    Build a CDXJ index for the warc `stream` / `filename`, add each
    index line to the redis sorted set `z_key`, and return the
    (min, max) capture times as epoch seconds (or (None, None) if
    nothing was indexed).
    """
    cdxout = BytesIO()
    write_cdx_index(cdxout, stream, filename, cdxj=True, append_post=True)

    cdx_list = cdxout.getvalue().rstrip().split(b'\n')

    count = 0
    min_ = max_ = None

    for cdx in cdx_list:
        # NOTE(review): statement grouping reconstructed from a collapsed
        # source line -- as written, a dry run skips the redis write AND
        # the count/min/max stats; confirm that is the intent
        if cdx and not self.dry:
            self.dst_redis.zadd(z_key, 0, cdx)

            cdxobj = CDXObject(cdx)
            ts = cdxobj['timestamp']
            # min_/max_ start as None, so first timestamp seeds both;
            # 14-digit timestamps compare correctly as strings
            min_ = min(min_, ts) if min_ else ts
            max_ = max(max_, ts) if max_ else ts
            count += 1

    if count:
        # convert timestamp strings to epoch seconds for the caller
        min_ = timestamp_to_sec(min_)
        max_ = timestamp_to_sec(max_)

    logging.info('  CDXJ: {0} {1} {2}'.format(count, min_, max_))
    return min_, max_
def handle_not_found(self, wbrequest, nfe):
    """
    Delegate 404 handling to the base handler, then — when the request
    came from a page within this collection (referrer under our
    prefix, non-query url) — record the embed as MISSING (4xx) or
    LIVE (2xx) in redis for later stats.
    """
    response = super(MementoHandler, self).handle_not_found(wbrequest, nfe)

    # only track embeds referred from one of our own replayed pages
    referrer = wbrequest.referrer
    if (wbrequest.wb_url.is_query() or not referrer or
            not referrer.startswith(wbrequest.wb_prefix)):
        return response

    wb_url = WbUrl(referrer[len(wbrequest.wb_prefix):])
    status = response.status_headers.get_statuscode()

    key_name = None
    if status.startswith('4') and not self.skip_missing_count(wb_url):
        key_name = 'MISSING '
    elif status.startswith('2'):
        key_name = 'LIVE '

    if key_name:
        page_key = redis_client.get_url_key(wb_url)
        ts = timestamp_now()
        value = key_name + ts + ' ' + wbrequest.wb_url.url
        save_value = str(timestamp_to_sec(ts)) + ' ' + 'text/html'
        redis_client.set_embed_entry(page_key, value, save_value)

    return response
def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest):
    """
    Fetch the capture described by `cdx` from one of the candidate
    archive urls and return (status_headers, raw stream).

    Also records per-session url/host/referrer info in redis
    (best effort — failures there are logged and ignored).

    :raises CaptureException: if no archive could serve the content
    """
    self.session.cookies.clear()

    try_urls, host, archive_name = self._get_urls_to_try(
        cdx, skip_hosts, wbrequest)

    try:
        response = self._do_req(try_urls, host, cdx,
                                wbrequest.env, skip_hosts)
    except Exception as e:
        # best effort: treat any fetch error as "not loadable"
        print(e)
        response = None

    if response is None:
        print(skip_hosts)
        raise CaptureException('Content Could Not Be Loaded')

    # redirects from the remote archive must have their location
    # headers rewritten back into our own url space
    if 300 <= response.status_code < 400:
        self.unrewrite_header(response, 'Location')
        self.unrewrite_header(response, 'Content-Location')

    remote = wbrequest.env.get('REMOTE_ADDR')
    req_ts = wbrequest.wb_url.timestamp
    base_key = remote + ':' + req_ts

    sec = timestamp_to_sec(cdx['timestamp'])
    referrer = wbrequest.env.get('HTTP_REFERER')

    try:
        pi = redisclient.redis.pipeline(transaction=False)
        pi.hset(base_key + ':urls', cdx['url'], sec)
        pi.sadd(base_key + ':hosts', archive_name)

        if referrer and not referrer.endswith('.css'):
            pi.set(base_key + ':ref', referrer)
        elif not referrer:
            pi.set(base_key + ':base', cdx['url'])

        pi.execute()
    except Exception:
        import traceback
        # fix: print_exc() takes a limit/file, not an exception object;
        # the old print_exc(e) misused the exception as the limit arg
        traceback.print_exc()

    statusline = str(response.status_code) + ' ' + response.reason
    headers = response.headers.items()
    stream = response.raw

    status_headers = StatusAndHeaders(statusline, headers)
    return (status_headers, stream)
def zadd_cdx(source, cdx, key):
    """
    Add a cdx line to a redis sorted set.

    With an explicit `key`, the whole line is stored at score 0.
    Otherwise the line is split into `urlkey timestamp rest`; the
    remainder is stored under the prefixed urlkey, scored by the
    timestamp in epoch seconds.
    """
    if key:
        source.redis.zadd(key, 0, cdx)
        return

    urlkey, timestamp, remainder = cdx.split(' ', 2)

    member = timestamp + ' ' + remainder
    score = timestamp_to_sec(timestamp)

    source.redis.zadd(source.key_prefix + urlkey, score, member)
def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest):
    """
    Fetch the capture described by `cdx` from one of the candidate
    archive urls and return (status_headers, raw stream).

    Also records per-session url/host/referrer info in redis
    (best effort — failures there are logged and ignored).

    :raises CaptureException: if no archive could serve the content
    """
    self.session.cookies.clear()

    try_urls, host, archive_name = self._get_urls_to_try(cdx, skip_hosts, wbrequest)

    try:
        response = self._do_req(try_urls, host, wbrequest.env, skip_hosts)
    except Exception as e:
        # best effort: treat any fetch error as "not loadable"
        print(e)
        response = None

    if response is None:
        print(skip_hosts)
        raise CaptureException('Content Could Not Be Loaded')

    # redirects from the remote archive must have their location
    # headers rewritten back into our own url space
    if 300 <= response.status_code < 400:
        self.unrewrite_header(response, 'Location')
        self.unrewrite_header(response, 'Content-Location')

    remote = wbrequest.env.get('REMOTE_ADDR')
    req_ts = wbrequest.wb_url.timestamp
    base_key = remote + ':' + req_ts

    sec = timestamp_to_sec(cdx['timestamp'])
    referrer = wbrequest.env.get('HTTP_REFERER')

    try:
        pi = redisclient.redis.pipeline(transaction=False)
        pi.hset(base_key + ':urls', cdx['url'], sec)
        pi.sadd(base_key + ':hosts', archive_name)

        if referrer and not referrer.endswith('.css'):
            pi.set(base_key + ':ref', referrer)
        elif not referrer:
            pi.set(base_key + ':base', cdx['url'])

        pi.execute()
    except Exception:
        import traceback
        # fix: print_exc() takes a limit/file, not an exception object;
        # the old print_exc(e) misused the exception as the limit arg
        traceback.print_exc()

    statusline = str(response.status_code) + ' ' + response.reason
    headers = response.headers.items()
    stream = response.raw

    status_headers = StatusAndHeaders(statusline, headers)
    return (status_headers, stream)
def sort_cci_timestamp(self, cci_iter, query):
    """
    Sort cci results by timestamp according to the query: nearest to
    `query.closest` (arcFileDate is in millis, hence the * 1000),
    else descending if `query.reverse`, else ascending. Yields at
    most `query.limit` results.
    """
    sorted_cci = []
    # parallel sorted list of sort keys; keeps the cci objects out of
    # the comparison entirely (the old (key, cci) tuple insort would
    # compare cci objects whenever keys tied) and makes ties stable
    sorted_keys = []
    limit = query.limit

    if query.closest:
        closest_sec = timestamp_to_sec(query.closest) * 1000
        key_func = lambda x: abs(closest_sec - x)
    elif query.reverse:
        key_func = lambda x: -x
    else:
        key_func = lambda x: x

    for cci in cci_iter:
        key = key_func(cci.data['arcFileDate'])

        i = bisect.bisect_right(sorted_keys, key)
        sorted_keys.insert(i, key)
        sorted_cci.insert(i, cci)

        if len(sorted_cci) > limit:
            # evict current worst from both parallel lists
            sorted_keys.pop()
            sorted_cci.pop()

    # yield directly; itertools.imap does not exist on py3
    for cci in sorted_cci:
        yield cci
def __init__(self, api_loader, url, timestamp):
    """
    Remember the api loader plus the target url/timestamp; the
    timestamp is also pre-converted to epoch seconds so later
    distance comparisons don't re-parse it.
    """
    self.target_sec = timestamp_to_sec(timestamp)
    self.target_timestamp = timestamp
    self.url = url
    self.api_loader = api_loader
def format_ts(value, format_="%a, %b %d %Y %H:%M:%S"):
    """
    Template filter: render a 14-digit timestamp with strftime,
    or return it as epoch seconds when the format is the special
    "%s" token.
    """
    if format_ == "%s":
        return timestamp_to_sec(value)

    return timestamp_to_datetime(value).strftime(format_)
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
    """
    Template filter: render a 14-digit timestamp with strftime,
    or return it as epoch seconds when the format is the special
    '%s' token.
    """
    if format_ == '%s':
        return timestamp_to_sec(value)

    dt = timestamp_to_datetime(value)
    return dt.strftime(format_)