def mergeWarc():
    """
    Merge multiple WARC files into a single file, writing revisit records
    for items which occur multiple times.

    Command-line entry point: input filenames are read from stdin, one per
    line, and the merged, gzip-compressed WARC is written to the ``output``
    positional argument. ``--verbose``/``-v`` enables debug logging.

    'resource' and 'response' records are deduplicated by their
    WARC-Payload-Digest header. The first record carrying a digest is copied
    unchanged; later records with the same digest are replaced by revisit
    records referring to the first one. All other records are copied as-is.
    """
    parser = argparse.ArgumentParser(
            description='Merge WARCs, reads filenames from stdin.')
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('output', type=argparse.FileType('wb'),
            help='Output WARC')

    args = parser.parse_args()
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=loglevel)

    unique = 0
    revisit = 0
    # payload digest -> uri/id/date of the first record seen with that digest
    payloadMap = {}
    writer = WARCWriter(args.output, gzip=True)
    for line in sys.stdin:
        path = line.strip()
        if not path:
            # Blank lines (e.g. a trailing newline) are not filenames.
            continue
        with open(path, 'rb') as fd:
            for record in ArchiveIterator(fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    # Records without a digest cannot be compared. Looking
                    # them up would make them all share the key None and
                    # turn them into revisits of an unrelated record.
                    dup = payloadMap.get(csum) if csum is not None else None
                    if dup is None:
                        if csum is not None:
                            payloadMap[csum] = {
                                'uri': headers.get_header('WARC-Target-URI'),
                                'id': rid,
                                'date': headers.get_header('WARC-Date'),
                            }
                        unique += 1
                    else:
                        logging.debug('Record %s is duplicate of %s',
                                rid, dup['id'])
                        # The revisit describes *this* capture, so it keeps
                        # its own target URI; only the Refers-To fields
                        # point at the earlier record.
                        record = writer.create_revisit_record(
                                headers.get_header('WARC-Target-URI'),
                                csum, dup['uri'], dup['date'])
                        record.rec_headers.add_header('WARC-Truncated', 'length')
                        record.rec_headers.add_header('WARC-Refers-To', dup['id'])
                        revisit += 1
                else:
                    unique += 1
                writer.write_record(record)
    logging.info('Wrote %d unique records, %d revisits', unique, revisit)
def mergeWarc(files, output):
    """
    Merge several WARC files into one, replacing repeated payloads with
    revisit records, and print merge statistics as JSON to stdout.

    :param files: Paths of the input WARC files, processed in order. The
        value is also embedded in the leading warcinfo record via
        ``json.dumps``, so it has to be JSON-serializable (presumably a list
        of strings -- confirm against the caller).
    :param output: Binary file object handed to ``WARCWriter``; the merged
        WARC is written gzip-compressed.

    'resource' and 'response' records are deduplicated by their
    WARC-Payload-Digest header. The first record carrying a digest is copied
    unchanged; later ones become revisit records which keep their own target
    URI and HTTP headers. Every other record type is copied as-is and counted
    as a unique record, without contributing to the byte totals.

    The statistics object has the keys ``unique``, ``revisit`` and ``ratio``.
    A ratio whose denominator is zero (no input records, or no
    resource/response records) is reported as ``null`` instead of raising
    ZeroDivisionError.
    """
    # stats
    unique = 0
    revisit = 0
    uniqueLength = 0
    revisitLength = 0

    # payload digest -> uri/id/date of the first record seen with that digest
    payloadMap = {}
    writer = WARCWriter(output, gzip=True)

    # Add an additional warcinfo record, describing the transformations. This
    # is not ideal, since
    #   “A ‘warcinfo’ record describes the records that
    #   follow it […] until next ‘warcinfo’”
    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
    # A warcinfo record is expected at the beginning of every file. But it
    # might have been written by a different software, so we don’t want to
    # strip/replace that information, but supplement it.
    warcinfo = {
        'software': getSoftwareInfo(),
        'tool': 'crocoite-merge',  # not the name of the cli tool
        'parameters': {'inputs': files},
    }
    payload = BytesIO(json.dumps(warcinfo, indent=2).encode('utf-8'))
    record = writer.create_warc_record('', 'warcinfo',
            payload=payload,
            warc_headers_dict={'Content-Type': makeContentType(jsonMime, 'utf-8')})
    writer.write_record(record)

    for path in files:
        with open(path, 'rb') as fd:
            for record in ArchiveIterator(fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    length = int(headers.get_header('Content-Length'))
                    # Records without a digest cannot be compared. Looking
                    # them up would make them all share the key None and
                    # turn them into revisits of an unrelated record.
                    dup = payloadMap.get(csum) if csum is not None else None
                    if dup is None:
                        if csum is not None:
                            payloadMap[csum] = {
                                'uri': headers.get_header('WARC-Target-URI'),
                                'id': rid,
                                'date': headers.get_header('WARC-Date'),
                            }
                        unique += 1
                        uniqueLength += length
                    else:
                        logging.debug('Record %s is duplicate of %s',
                                rid, dup['id'])
                        # Payload may be identical, but HTTP headers are
                        # (probably) not. Include them.
                        record = writer.create_revisit_record(
                                headers.get_header('WARC-Target-URI'),
                                digest=csum,
                                refers_to_uri=dup['uri'],
                                refers_to_date=dup['date'],
                                http_headers=record.http_headers)
                        record.rec_headers.add_header('WARC-Truncated', 'length')
                        record.rec_headers.add_header('WARC-Refers-To', dup['id'])
                        revisit += 1
                        revisitLength += length
                else:
                    unique += 1
                writer.write_record(record)

    # Both denominators can legitimately be zero; report an undefined ratio
    # as null rather than crashing after the output has been written.
    totalRecords = unique + revisit
    totalLength = uniqueLength + revisitLength
    stats = {
        'unique': {'records': unique, 'bytes': uniqueLength},
        'revisit': {'records': revisit, 'bytes': revisitLength},
        'ratio': {
            'records': unique / totalRecords if totalRecords else None,
            'bytes': uniqueLength / totalLength if totalLength else None,
        },
    }
    json.dump(stats, sys.stdout, cls=StrJsonEncoder)
    sys.stdout.write('\n')