def write_warcinfo_record(self, warc):
    """Write the initial warcinfo record to an open WARC file.

    The record body lists this writer's ``software``, ``hostname`` and
    ``ip`` attributes, followed by ``description`` when it is not None.

    Arguments:
    warc -- writable file object for the WARC; flushed after the write.
    """
    headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        # WARC-Date is a UTC timestamp, so take the current time in UTC
        # rather than the machine's local time.
        (WarcRecord.DATE, warc_datetime_str(datetime.utcnow())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
    ]
    data = "software=%s\nhostname=%s\nip=%s" % (
        self.software, self.hostname, self.ip)
    if self.description is not None:
        data += "\ndescription=%s" % self.description
    record = WarcRecord(
        headers=headers, content=("application/warc-fields", data))
    record.write_to(warc, gzip=self.gzip)
    warc.flush()
def warcinfo_record(warc_filename):
    """Build the warcinfo WarcRecord that must open a WARC file.

    The record carries the WARC format / conformance fields as its body
    and names ``warc_filename`` in its filename header. The record ID is
    derived from the body text plus the current UTC timestamp.
    """
    timestamp = warc_datetime_str(datetime.utcnow())
    fields = (
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"
    )
    body = "\r\n".join(fields)

    record_headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.CONTENT_TYPE, b'application/warc-fields'),
        (WarcRecord.ID, warc_uuid(body + timestamp)),
        (WarcRecord.DATE, timestamp),
        (WarcRecord.FILENAME, warc_filename),
    ]
    record_content = (b'application/warc-fields', body + "\r\n")
    return WarcRecord(
        headers=record_headers,
        content=record_content,
        version=b"WARC/1.0",
    )
def write_record(self, headers, mime, data):
    """Write one WARC record to a WARC file taken from the pool.

    Arguments:
    headers -- Array of WARC headers.
    mime -- MIME type of the data.
    data -- the data block.

    A WARC name is taken from ``self.pool``, the record is written to the
    matching handle in ``self.warcs`` and flushed. The name is returned to
    the pool only while that WARC has not reached its maximum size.
    """
    warc_record = WarcRecord(headers=headers, content=(mime, data))
    logger.debug("Getting WARC: %s" % str(self.warcs.keys()))

    warc_name = self.pool.get()
    logger.debug("Writing to: %s" % warc_name)

    handle = self.warcs[warc_name]
    warc_record.write_to(handle, gzip=self.gzip)
    handle.flush()

    # A full WARC is deliberately left out of the pool.
    if self.warc_reached_max_size(warc_name):
        return
    logger.debug("%s undersized; adding back to the pool." % warc_name)
    self.pool.put(warc_name)
def create_metadata_record_bytes(
        url='http://example.com/',
        content_type='image/png',
        date='2016-08-03T10:49:41Z',
        content=b'',
        include_block_digest=True):
    """Build the gzipped bytes of a single WARC metadata record.

    Arguments:
    url -- target URL of the record (text; UTF-8 encoded into the header).
    content_type -- Content-Type header value (text).
    date -- WARC-Date header value (text).
    content -- record body as bytes.
    include_block_digest -- when true, add a block-digest header holding
        the base32-encoded SHA-1 of ``content``, prefixed with ``sha1:``.

    Returns the gzip-compressed record as bytes.
    """
    headers = {
        WarcRecord.TYPE: WarcRecord.METADATA,
        WarcRecord.URL: url.encode('utf-8'),
        WarcRecord.CONTENT_TYPE: content_type.encode('utf-8'),
        WarcRecord.DATE: date.encode('utf-8')
    }
    if include_block_digest:
        block_digest = base64.b32encode(hashlib.sha1(content).digest())
        headers[WarcRecord.BLOCK_DIGEST] = b'sha1:' + block_digest

    # The record is serialized by hand instead of via WarcRecord: the
    # original author noted that WarcRecord.write_to() ignores a supplied
    # WARC-Block-Digest and writes a hex-encoded SHA256 computed from the
    # content.
    out = io.BytesIO()
    with GzipFile(fileobj=out, mode='wb') as z:
        z.write(b'WARC/1.0\r\n')
        for k, v in headers.items():
            z.write(b''.join((k, b': ', v, b'\r\n')))
        z.write('Content-Length: {}\r\n'.format(len(content)).encode('ascii'))
        z.write(b'\r\n')
        z.write(content)
        z.write(b'\r\n\r\n')
        # Explicit flush kept so the compressed byte stream is laid out the
        # same way as before this function used a context manager.
        z.flush()
    return out.getvalue()
def _init_file(self):
    """Write the opening warcinfo record for this WARC file.

    The headers carry a random record ID, the current UTC date, the base
    name of ``self._file_name`` and ``self._main_url``. The body is a fixed
    set of warc-fields lines. The record is passed to ``self.write_record``.
    """
    headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.ID, WarcRecord.random_warc_uuid()),
        (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow())),
        (WarcRecord.FILENAME, os.path.basename(self._file_name)),
        (Warc.MAIN_URL, self._main_url),
    ]

    field_lines = (
        "software: bardo",
        "format: WARC File Format 1.0",
        "conformsTo: " + CONFORMS_TO,
        "robots: unknown",
    )
    body = "\r\n".join(field_lines)

    record = WarcRecord(
        headers=headers,
        content=("application/warc-fields", body),
    )
    self.write_record(record)
def tweet_warc_record(tweet_json):
    """Parse Tweet JSON and return a WarcRecord for it.

    Arguments:
    tweet_json -- one tweet serialized as a JSON string.

    Returns a ``resource`` WarcRecord whose body is ``tweet_json`` followed
    by CRLF, or None when the tweet is skipped: a payload without a
    ``user`` key (deleted tweet), or input that cannot be parsed or lacks
    the ``user.screen_name``, ``id`` or ``timestamp_ms`` fields. Parse
    failures are logged with their traceback.
    """
    try:
        tweet = json.loads(tweet_json)
        # skip deleted tweet
        if 'user' not in tweet:
            return None
        url = "https://twitter.com/%s/status/%s" % (
            tweet['user']['screen_name'], tweet['id'])
        # Read the timestamp inside the guarded section so a tweet without
        # a usable timestamp_ms is logged and skipped like any other
        # malformed tweet instead of raising to the caller.
        timestamp = float(tweet['timestamp_ms']) / 1000.0
    except Exception:
        # Deliberately broad: one bad tweet must not abort the conversion
        # of the remaining ones; the failure is logged, not hidden.
        logging.error('error in tweet_warc_record', exc_info=1)
        return None

    warc_date = warc_datetime_str(datetime.utcfromtimestamp(timestamp))
    return WarcRecord(
        headers=[
            (WarcRecord.TYPE, WarcRecord.RESOURCE),
            (WarcRecord.CONTENT_TYPE, b'application/json'),
            (WarcRecord.ID, warc_uuid(url + warc_date)),
            (WarcRecord.URL, url),
            (WarcRecord.DATE, warc_date),
        ],
        content=(b'application/json', tweet_json + "\r\n"),
        version=b"WARC/1.0")
def _convert_arc_records(fh, out, gzip):
    """Write every record read from the ARC archive ``fh`` to ``out`` as WARC."""
    version = "WARC/1.0"
    # ID of the warcinfo record produced from the most recent 'filedesc'
    # record; later records refer back to it.
    warcinfo_id = None

    for record in fh:
        warc_id = make_warc_uuid(record.url + record.date)
        headers = [
            (WarcRecord.ID, warc_id),
        ]
        if record.date:
            date = datetime.datetime.strptime(record.date, '%Y%m%d%H%M%S')
            headers.append((WarcRecord.DATE, warc_datetime_str(date)))

        if record.type == 'filedesc':
            # The ARC file description becomes a warcinfo record...
            warcinfo_id = warc_id

            warcinfo_headers = list(headers)
            warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
            warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))

            warcinfo_content = ('application/warc-fields',
                                'software: hanzo.arc2warc\r\n')

            warcrecord = WarcRecord(headers=warcinfo_headers,
                                    content=warcinfo_content,
                                    version=version)
            warcrecord.write_to(out, gzip=gzip)

            # ...plus a metadata record holding the raw ARC record.
            warc_id = make_warc_uuid(record.url + record.date + "-meta")
            warcmeta_headers = [
                (WarcRecord.TYPE, WarcRecord.METADATA),
                (WarcRecord.CONCURRENT_TO, warcinfo_id),
                (WarcRecord.ID, warc_id),
                (WarcRecord.URL, record.url),
                (WarcRecord.DATE, warcrecord.date),
                (WarcRecord.WARCINFO_ID, warcinfo_id),
            ]
            warcmeta_content = ('application/arc', record.raw())

            warcrecord = WarcRecord(headers=warcmeta_headers,
                                    content=warcmeta_content,
                                    version=version)
            warcrecord.write_to(out, gzip=gzip)
        else:
            content_type, content = record.content
            if record.url.startswith('http'):
                # don't promote content-types for http urls,
                # they contain headers in the body.
                content_type = "application/http;msgtype=response"

            headers.extend([
                (WarcRecord.TYPE, WarcRecord.RESPONSE),
                (WarcRecord.URL, record.url),
                (WarcRecord.WARCINFO_ID, warcinfo_id),
            ])

            warcrecord = WarcRecord(headers=headers,
                                    content=(content_type, content),
                                    version=version)
            warcrecord.write_to(out, gzip=gzip)


def main(argv):
    """Convert the ARC files named on the command line to WARC records.

    Arguments:
    argv -- full argument vector; argv[0] is skipped when parsing.

    Records are written to ``options.output`` (opened in binary append
    mode) when given, otherwise to standard output. An output name ending
    in ``.gz`` turns gzip output on.

    Returns 0 on completion.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Check the inputs before touching the output so a usage error does not
    # create (and leave open) an output file.
    if not input_files:
        parser.error("no input warc file(s)")

    out = sys.stdout
    opened_output = False
    if options.output:
        out = open(options.output, 'ab')
        opened_output = True
        if options.output.endswith('.gz'):
            options.gzip = True

    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                _convert_arc_records(fh, out, options.gzip)
            finally:
                # Release the archive even when a record fails to convert.
                fh.close()
    finally:
        # Only close what this function opened; never close sys.stdout.
        if opened_output:
            out.close()

    return 0