def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: return None if params.get('content_type') != self.CONTENT_TYPE: return None if not self.ydl: return None info = self.ydl.extract_info(load_url) info_buff = json.dumps(info) info_buff = info_buff.encode('utf-8') warc_headers = {} schema, rest = load_url.split('://', 1) target_url = 'metadata://' + rest dt = timestamp_to_datetime(cdx['timestamp']) warc_headers['WARC-Type'] = 'metadata' warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = target_url warc_headers['WARC-Date'] = datetime_to_iso_date(dt) warc_headers['Content-Type'] = self.CONTENT_TYPE warc_headers['Content-Length'] = str(len(info_buff)) warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return warc_headers, None, BytesIO(info_buff)
def lookup(self, digest, url, timestamp): start, end = calc_search_range(url, 'exact') results = self.redis.zrangebylex(self.key, '[' + start, '(' + end) for res in results: cdx = CDXObject(res) if digest == cdx.get('digest'): return ('revisit', cdx['url'], timestamp_to_datetime(cdx['timestamp'])) return None
def format_ts(value, format_="%a, %b %d %Y %H:%M:%S"): if format_ == "%s": return timestamp_to_sec(value) else: value = timestamp_to_datetime(value) return value.strftime(format_)
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): value = timestamp_to_datetime(value) return value.strftime(format_)
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): if format_ == '%s': return timestamp_to_sec(value) else: value = timestamp_to_datetime(value) return value.strftime(format_)
def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: return None if params.get('content_type') == VideoLoader.CONTENT_TYPE: return None input_req = params['_input_req'] req_headers = input_req.get_req_headers() dt = timestamp_to_datetime(cdx['timestamp']) if cdx.get('memento_url'): req_headers['Accept-Datetime'] = datetime_to_http_date(dt) method = input_req.get_req_method() data = input_req.get_req_body() p = PreparedRequest() p.prepare_url(load_url, None) p.prepare_headers(None) p.prepare_auth(None, load_url) auth = p.headers.get('Authorization') if auth: req_headers['Authorization'] = auth load_url = p.url try: upstream_res = self.pool.urlopen(method=method, url=load_url, body=data, headers=req_headers, redirect=False, assert_same_host=False, preload_content=False, decode_content=False, retries=self.num_retries, timeout=params.get('_timeout')) except Exception as e: raise LiveResourceException(load_url) memento_dt = upstream_res.headers.get('Memento-Datetime') if memento_dt: dt = http_date_to_datetime(memento_dt) cdx['timestamp'] = datetime_to_timestamp(dt) elif cdx.get('memento_url'): # if 'memento_url' set and no Memento-Datetime header present # then its an error return None agg_type = upstream_res.headers.get('WebAgg-Type') if agg_type == 'warc': cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll')) return None, upstream_res.headers, upstream_res self.raise_on_self_redirect(params, cdx, str(upstream_res.status), upstream_res.headers.get('Location')) if upstream_res.version == 11: version = '1.1' else: version = '1.0' status = 'HTTP/{version} {status} {reason}\r\n' status = status.format(version=version, status=upstream_res.status, reason=upstream_res.reason) http_headers_buff = status orig_resp = upstream_res._original_response try: #pragma: no cover #PY 3 resp_headers = orig_resp.headers._headers for n, v in resp_headers: if n.lower() in self.SKIP_HEADERS: continue http_headers_buff += n + ': ' + v + '\r\n' except: #pragma: no cover #PY 2 resp_headers = orig_resp.msg.headers for n, v in zip(orig_resp.getheaders(), resp_headers): if n in self.SKIP_HEADERS: continue http_headers_buff += v http_headers_buff += '\r\n' http_headers_buff = http_headers_buff.encode('latin-1') try: fp = upstream_res._fp.fp if hasattr(fp, 'raw'): #pragma: no cover fp = fp.raw remote_ip = fp._sock.getpeername()[0] except: #pragma: no cover remote_ip = None warc_headers = {} warc_headers['WARC-Type'] = 'response' warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = cdx['url'] warc_headers['WARC-Date'] = datetime_to_iso_date(dt) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip warc_headers['Content-Type'] = 'application/http; msgtype=response' self._set_content_len(upstream_res.headers.get('Content-Length', -1), warc_headers, len(http_headers_buff)) warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return (warc_headers, http_headers_buff, upstream_res)
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): value = timestamp_to_datetime(value) return value.strftime(format_)
def __call__(self, cdx, params): dt = timestamp_to_datetime(cdx['timestamp']) return ('revisit', cdx['url'], datetime_to_iso_date(dt))