def put_custom_record(self, environ, coll):
    """Record a custom WARC 'resource' record PUT by the client into *coll*.

    Drains the request body from the WSGI input stream, forwards it via
    HTTP PUT to the configured custom-record endpoint, and returns that
    endpoint's JSON response.

    :param dict environ: WSGI environment for the incoming request
    :param str coll: target collection name
    :return: JSON response from the recording endpoint (or an error)
    :rtype: WbResponse
    """
    # Drain the request body; read() returns b'' at end-of-stream.
    # Fix: removed leftover debug print() of each chunk length.
    chunks = []
    while True:
        buff = environ["wsgi.input"].read()
        if not buff:
            break
        chunks.append(buff)

    data = b"".join(chunks)

    params = dict(parse_qsl(environ.get("QUERY_STRING")))

    rec_type = "resource"

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")
    if not target_uri:
        # Fix: signal a client error explicitly instead of an implicit 200,
        # matching the documented variant of this handler.
        return WbResponse.json_response({"error": "no url"},
                                        status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    put_url = self.custom_record_path.format(url=target_uri, coll=coll,
                                             rec_type=rec_type)
    res = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(res.json())
def serialize_json_pages(self, pages, id, title, desc=None, has_text=False):
    """Yield a json-pages-1.0 serialization: one JSON header line followed
    by one JSON line per page entry.

    Pages missing an id get a freshly generated short uuid.
    """
    header = {"format": "json-pages-1.0", "id": id}
    if title:
        header["title"] = title
    if desc:
        header["description"] = desc
    if has_text:
        header["hasText"] = True
    yield json.dumps(header) + "\n"

    for page in pages:
        entry = {
            "id": page.get("id") or page.get("page_id") or shortuuid.uuid(),
            "url": page["url"],
            "ts": timestamp_to_iso_date(page["timestamp"]),
        }
        if page.get("title"):
            entry["title"] = page["title"]
        if "text" in page:
            entry["text"] = page["text"]
        yield json.dumps(entry) + "\n"
def ingest(self, coll, text, params):
    """Extract text from a parsed-DOM JSON document and POST it to Solr.

    Short-circuits without indexing when an identical digest already
    exists for the same url/timestamp (dupe update).
    """
    parsed = json.loads(text)

    mdata = {}
    # extract_text yields text fragments; join into one searchable body.
    content = "\n".join(extract_text(parsed["root"], mdata))

    url = params.get('url')
    timestamp_ss = params.get('timestamp')
    timestamp_dts = timestamp_to_iso_date(timestamp_ss)
    has_screenshot_b = params.get('hasScreenshot') == '1'

    # Fall back to the URL when no title was extracted from the DOM.
    title = mdata.get('title') or url

    digest = self.get_digest(content)
    if self.update_if_dupe(digest, coll, url, timestamp_ss, timestamp_dts):
        return

    doc = {
        'coll_s': coll,
        'title_t': title,
        'content_t': content,
        'url_s': url,
        'digest_s': digest,
        'timestamp_ss': timestamp_ss,
        'timestamp_dts': timestamp_dts,
        'has_screenshot_b': has_screenshot_b,
    }
    requests.post(self.solr_api, json=doc)
def create_redirect_record(self, url, redirect_url, timestamp, status='301'):
    """Write an HTTP redirect response record pointing *url* at *redirect_url*.

    The payload is empty; the redirect is carried by the Location header.
    Returns the written record.
    """
    body = b''

    http_headers = StatusAndHeaders(
        status + ' Redirect',
        [('Content-Length', str(len(body))),
         ('Location', redirect_url)],
        protocol='HTTP/1.0')

    rec = self.writer.create_warc_record(
        url, 'response',
        payload=BytesIO(body),
        length=len(body),
        http_headers=http_headers,
        warc_headers_dict={'WARC-Date': timestamp_to_iso_date(timestamp)})

    self.writer.write_record(rec)
    return rec
def ingest(self, text, params):
    """Index already-extracted page text (utf-8 bytes) into Solr.

    Unlike the DOM-based ingest, *text* is plain extracted content and the
    collection/record identifiers come from *params*.
    """
    content = text.decode('utf-8')

    url = params.get('url')
    title = params.get('title') or url
    timestamp_s = params.get('timestamp') or timestamp_now()
    timestamp_dt = timestamp_to_iso_date(timestamp_s)
    has_screenshot_b = params.get('hasScreenshot') == '1'

    digest = self.get_digest(content)
    # NOTE: digest-based dupe detection is not applied on this path.

    doc = {
        'user_s': params.get('user'),
        'coll_s': params.get('coll'),
        'rec_s': params.get('rec'),
        'id': params.get('pid'),
        'title_t': title,
        'content_t': content,
        'url_s': url,
        'digest_s': digest,
        'timestamp_s': timestamp_s,
        'timestamp_dt': timestamp_dt,
        'has_screenshot_b': has_screenshot_b,
    }
    requests.post(self.solr_api, json=doc)
def _set_fixed_dt(self, fixed_dt): if not fixed_dt: return None fixed_dt = DATE_TIMESPLIT.sub('', fixed_dt) fixed_dt = pad_timestamp(fixed_dt, PAD_14_DOWN) fixed_dt = timestamp_to_iso_date(fixed_dt) return fixed_dt
def serialize_cdxj_pages(self, pages):
    """Yield a cdxj-pages-1.0 serialization of the page map: a !meta line
    followed by '<quoted title> <iso ts> <json>' lines."""
    yield '!meta 0 ' + json.dumps({'format': 'cdxj-pages-1.0',
                                   'title': 'All Pages'})

    for page in pages.values():
        iso_ts = timestamp_to_iso_date(page['timestamp'])
        # URL-quote the key but keep common URL punctuation readable.
        key = quote(page.get('title') or page.get('url'), safe=':/?&=')

        entry = {'url': page['url']}
        if 'text' in page:
            entry['text'] = page['text']

        yield key + ' ' + iso_ts + ' ' + json.dumps(entry)
def serialize_json_pages(self, pages):
    """Yield a json-pages-1.0 header line, then one JSON line per page."""
    yield json.dumps({'format': 'json-pages-1.0', 'title': 'All Pages'}) + "\n"

    for page in pages.values():
        entry = {'url': page['url'],
                 'ts': timestamp_to_iso_date(page['timestamp'])}
        if page.get('title'):
            entry['title'] = page['title']
        if 'text' in page:
            entry['text'] = page['text']
        yield json.dumps(entry) + "\n"
def create_revisit_record(self, original, url, redirect_url, timestamp):
    """Write a revisit record for *url* that 302-redirects to *redirect_url*,
    deduplicating against the previously written *original* record (its
    payload digest and date are reused)."""
    http_headers = StatusAndHeaders(
        '302 Temp Redirect',
        [('Content-Length', '0'), ('Location', redirect_url)],
        protocol='HTTP/1.0')

    rec = self.writer.create_revisit_record(
        url,
        digest=original.rec_headers['WARC-Payload-Digest'],
        refers_to_uri=url,
        refers_to_date=original.rec_headers['WARC-Date'],
        warc_headers_dict={'WARC-Date': timestamp_to_iso_date(timestamp)},
        http_headers=http_headers)

    self.writer.write_record(rec)
def make_transclusion_metadata(self, writer, url, record):
    """Find transclusions (embedded resources) in *record* and write each
    one out as a 'urn:embeds:<url>' WARC resource record whose payload is
    the transclusion metadata serialized as JSON.

    Also bumps ``self.count`` and appends an entry to the logfile for each
    record written.

    :param writer: WARC writer used to build and persist the records
    :param str url: URL of the containing page
    :param record: WARC record whose content is scanned for transclusions
    """
    content_type = record.rec_headers.get('Content-Type')

    # NOTE(review): the loop target rebinds ``url`` (and later ``record``);
    # find_transclusions() is invoked once, before iteration, with the
    # original page url, so the rebinding does not affect the scan.
    for url, timestamp, metadata in self.transclusion_serializer.find_transclusions(
            url, content_type):
        #if url.startswith('http://'):
        #    url = url.replace('http://', 'metadata://')
        #elif url.startswith('https://'):
        #    url = url.replace('https://', 'metadata://')

        embeds_url = 'urn:embeds:' + url

        # Deterministic serialization (sorted keys) of the metadata payload.
        content = json.dumps(metadata, indent=2, sort_keys=True).encode('utf-8')

        warc_date = timestamp_to_iso_date(timestamp)

        warc_headers = {
            'WARC-Date': warc_date,
            'WARC-Creation-Date': writer._make_warc_date()
        }

        warc_content_type = 'application/vnd.youtube-dl_formats+json'

        record = writer.create_warc_record(
            embeds_url,
            'resource',
            payload=BytesIO(content),
            length=len(content),
            warc_content_type=warc_content_type,
            warc_headers_dict=warc_headers)

        logging.debug(
            'Writing transclusion metadata at {0}'.format(embeds_url))

        writer.write_record(record)
        self.count += 1

        self.logger.debug('Writing "{0}" ({1}) @ "{2}" from "{3}"'.format(
            embeds_url, warc_content_type, warc_date, '-'))

        self.write_logfile({
            'file': '-',
            'Record-Type': 'metadata',
            'URL': embeds_url,
            'timestamp': warc_date,
        })
def create_response_record(self, url, timestamp, text):
    """Write a 200 OK response record whose body is *text* (utf-8).

    Returns the written record.
    """
    body = text.encode('utf-8')

    http_headers = StatusAndHeaders(
        '200 OK',
        [('Content-Length', str(len(body)))],
        protocol='HTTP/1.0')

    rec = self.writer.create_warc_record(
        url, 'response',
        payload=BytesIO(body),
        length=len(body),
        http_headers=http_headers,
        warc_headers_dict={'WARC-Date': timestamp_to_iso_date(timestamp)})

    self.writer.write_record(rec)
    return rec
def create_revisit_record(self, url, timestamp, redirect_url, original_dt):
    """Write an empty-payload revisit record for *url* that 302-redirects
    to *redirect_url*, referring back to a capture at *original_dt*."""
    # Base32 SHA-1 digest of empty content (the record carries no payload).
    empty_digest = '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'

    http_headers = StatusAndHeaders(
        '302 Temp Redirect',
        [('Content-Length', '0'), ('Location', redirect_url)],
        protocol='HTTP/1.0')

    rec = self.writer.create_revisit_record(
        url,
        digest=empty_digest,
        refers_to_uri=url,
        refers_to_date=original_dt,
        warc_headers_dict={'WARC-Date': timestamp_to_iso_date(timestamp)},
        http_headers=http_headers)

    self.writer.write_record(rec)
def generate_metadata(self, res, wacz):
    """Build the frictionless data-package JSON describing the WACZ contents.

    Every member of the zip is listed as a resource with its path, content
    hash and byte length, followed by optional title/description/main-page
    metadata and version/software stamps.

    :param res: parsed options (may carry desc/title/date overrides)
    :param wacz: open ZipFile of the archive being described
    :return: pretty-printed JSON string of the package descriptor
    """
    package_dict = {"profile": "data-package", "resources": []}

    # Fix: the original called wacz.infolist() on every loop iteration
    # (twice per member); enumerate the zip once and iterate directly.
    for zinfo in wacz.infolist():
        with wacz.open(zinfo, "r") as member:
            content = member.read()
        package_dict["resources"].append({
            "path": zinfo.filename,
            "hash": support_hash_file(self.hash_type, content),
            "bytes": len(content),
        })

    # Optional metadata: per-run values win over instance defaults.
    desc = res.desc or self.desc
    title = res.title or self.title

    if title:
        package_dict["title"] = title
    if desc:
        package_dict["description"] = desc
    if self.main_url:
        package_dict["mainPageURL"] = self.main_url
        # NOTE(review): main_ts nested under main_url (a main-page date only
        # makes sense with a main-page URL) — confirm against upstream.
        if self.main_ts:
            package_dict["mainPageDate"] = timestamp_to_iso_date(self.main_ts)
    if res.date:
        # Explicit date override wins over main_ts.
        package_dict["mainPageDate"] = res.date

    package_dict["created"] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%SZ")
    package_dict["wacz_version"] = WACZ_VERSION
    package_dict["software"] = "py-wacz " + get_py_wacz_version()

    return json.dumps(package_dict, indent=2)
def create_record(cls, writer, url, text, timestamp):
    """Create and write a plain-text 200 OK response record for *url*.

    The WARC-Date header is set only when *timestamp* is provided.
    """
    body = text.encode('utf-8')

    status_headers = StatusAndHeaders(
        '200 OK',
        [('Content-Type', 'text/plain; charset="UTF-8"'),
         ('Content-Length', str(len(body)))],
        protocol='HTTP/1.0')

    warc_headers = {}
    if timestamp:
        warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

    rec = writer.create_warc_record(
        url, 'response',
        payload=BytesIO(body),
        length=len(body),
        http_headers=status_headers,
        warc_headers_dict=warc_headers)

    writer.write_record(rec)
def put_custom_record(self, environ, coll="$root"):
    """
    When recording, PUT a custom WARC record to the specified collection
    (Available only when recording)

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served from
    """
    # Drain the WSGI input stream; read() yields b'' at end-of-stream.
    body_parts = []
    buff = environ["wsgi.input"].read()
    while buff:
        body_parts.append(buff)
        buff = environ["wsgi.input"].read()
    data = b"".join(body_parts)

    params = dict(parse_qsl(environ.get("QUERY_STRING")))

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")
    if not target_uri:
        return WbResponse.json_response({"error": "no url"},
                                        status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    put_url = self.put_custom_record_path.format(url=target_uri,
                                                 coll=coll,
                                                 rec_type="resource")
    response = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(response.json())
def create_record(cls, writer, url, text, timestamp):
    """Write a text/plain 200 OK response record containing *text*.

    Omits WARC-Date when no *timestamp* is given.
    """
    payload = text.encode('utf-8')

    headers_list = [
        ('Content-Type', 'text/plain; charset="UTF-8"'),
        ('Content-Length', str(len(payload))),
    ]
    http_headers = StatusAndHeaders('200 OK', headers_list,
                                    protocol='HTTP/1.0')

    warc_headers = ({'WARC-Date': timestamp_to_iso_date(timestamp)}
                    if timestamp else {})

    rec = writer.create_warc_record(url, 'response',
                                    payload=BytesIO(payload),
                                    length=len(payload),
                                    http_headers=http_headers,
                                    warc_headers_dict=warc_headers)
    writer.write_record(rec)
def generate_metadata(self, res):
    """Render collection metadata (title, description, text index and the
    page list) as a YAML document."""
    data = {}

    title = res.title or self.title
    if title:
        data['title'] = title

    desc = res.desc or self.desc
    if desc:
        data['desc'] = desc

    # Only reference the text index when text was captured (and the
    # index path constant is non-empty).
    if res.text and PAGE_INDEX:
        data['textIndex'] = PAGE_INDEX

    data['pages'] = [
        {
            'title': page.get('title') or page.get('url'),
            'date': timestamp_to_iso_date(page['timestamp']),
            'url': page['url'],
        }
        for page in self.pages.values()
    ]

    return yaml.dump(data)
def put_record(self, environ, coll, target_uri_format, rec_type, params, data):
    """PUT a record of *rec_type* into *coll* via the custom-record endpoint.

    Ensures the collection exists, builds the WARC target URI from
    *target_uri_format*, and forwards *data* with Content-Type (and
    optional WARC-Date) headers.

    :param dict environ: WSGI environment (supplies the Content-Type header)
    :param str coll: target collection; created if missing
    :param str target_uri_format: format string producing the target URI
    :param str rec_type: WARC record type to write
    :param dict params: must include 'url'; may include 'timestamp'
    :param bytes data: record payload
    :return: JSON response from the recording endpoint (or an error)
    :rtype: WbResponse
    """
    self.ensure_coll_exists(coll)

    headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

    url = params.get('url')
    if not url:
        # Fix: report a proper client error status instead of an implicit
        # success, consistent with the put_custom_record handler.
        return WbResponse.json_response({'error': 'no url'},
                                        status='400 Bad Request')

    timestamp = params.get('timestamp')
    if timestamp:
        headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

    target_uri = target_uri_format.format(url=url)

    put_url = self.custom_record_path.format(url=target_uri, coll=coll,
                                             rec_type=rec_type)
    res = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(res.json())
def _get_protocol_and_headers(self, headerline, parts):
    """Convert an ARC header line plus its fields into a WARC/1.0 protocol
    string and header list.

    'filedesc://' lines become 'warcinfo' records (carrying a
    WARC-Filename); everything else becomes a 'response' record with an
    http msgtype content-type.
    """
    if headerline.startswith('filedesc://'):
        rec_type = 'warcinfo'
    else:
        rec_type = 'response'
        # Force the ARC mime field to the WARC response content-type.
        parts[3] = 'application/http;msgtype=response'

    headers = [
        ('WARC-Type', rec_type),
        ('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()),
    ]

    for name, value in zip(self.headernames, parts):
        if name == 'WARC-Date':
            value = timestamp_to_iso_date(value)

        if rec_type == 'warcinfo' and name == 'WARC-Target-URI':
            # warcinfo records carry the filename, not a target URI.
            name = 'WARC-Filename'
            value = value[len('filedesc://'):]

        headers.append((name, value))

    return ('WARC/1.0', headers)