Example #1
0
def mergeWarc():
    """
    Merge multiple WARC files into a single file, writing revisit records for
    items which occur multiple times.

    Command-line entry point. Input WARC filenames are read from stdin, one
    per line (blank lines are ignored); the output WARC path is given as the
    positional ``output`` argument. ``--verbose``/``-v`` enables debug
    logging.

    ``resource`` and ``response`` records are deduplicated by their
    ``WARC-Payload-Digest``: the first record with a given digest is copied
    verbatim, every later one is replaced by a revisit record pointing at the
    first. All other record types are copied unchanged and counted as unique.
    """

    parser = argparse.ArgumentParser(
        description='Merge WARCs, reads filenames from stdin.')
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('output',
                        type=argparse.FileType('wb'),
                        help='Output WARC')

    args = parser.parse_args()
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=loglevel)

    unique = 0
    revisit = 0
    # Maps payload digest -> identifying headers of the first record seen
    # with that digest.
    payloadMap = {}
    # Make sure the output handle opened by argparse is flushed and closed,
    # even if reading one of the inputs fails.
    with args.output as output:
        writer = WARCWriter(output, gzip=True)
        for line in sys.stdin:
            path = line.strip()
            if not path:
                # Tolerate blank/trailing lines instead of failing on open('').
                continue
            with open(path, 'rb') as fd:
                for record in ArchiveIterator(fd):
                    if record.rec_type in {'resource', 'response'}:
                        headers = record.rec_headers
                        rid = headers.get_header('WARC-Record-ID')
                        csum = headers.get_header('WARC-Payload-Digest')
                        uri = headers.get_header('WARC-Target-URI')
                        # A record without a digest cannot be compared to
                        # anything; never treat two such records as equal.
                        dup = None if csum is None else payloadMap.get(csum)
                        if dup is None:
                            if csum is not None:
                                payloadMap[csum] = {
                                    'uri': uri,
                                    'id': rid,
                                    'date': headers.get_header('WARC-Date')
                                }
                            unique += 1
                        else:
                            logging.debug('Record %s is duplicate of %s',
                                          rid, dup['id'])
                            # The revisit describes *this* capture, so it
                            # carries this record's URI; only the refers-to
                            # fields point at the original. Payload may be
                            # identical, but HTTP headers are (probably)
                            # not, so keep them.
                            record = writer.create_revisit_record(
                                uri, csum, dup['uri'], dup['date'],
                                http_headers=record.http_headers)
                            record.rec_headers.add_header('WARC-Truncated',
                                                          'length')
                            record.rec_headers.add_header('WARC-Refers-To',
                                                          dup['id'])
                            revisit += 1
                    else:
                        unique += 1
                    writer.write_record(record)
    logging.info('Wrote {} unique records, {} revisits'.format(
        unique, revisit))
Example #2
0
def mergeWarc (files, output):
    """
    Merge multiple WARC files into one, replacing repeated payloads with
    revisit records.

    :param files: Paths of the input WARC files; each is opened in binary
        mode and read in order. Also recorded in the leading warcinfo record.
    :param output: Destination passed to WARCWriter; records are written
        gzip-compressed.

    ``resource`` and ``response`` records are deduplicated by their
    ``WARC-Payload-Digest``; all other record types are copied unchanged and
    counted as unique. When done, a single JSON line with record/byte
    statistics is written to stdout. A ratio is reported as ``null`` when its
    denominator is zero (e.g. no input records).
    """
    # stats
    unique = 0
    revisit = 0
    uniqueLength = 0
    revisitLength = 0

    # Maps payload digest -> identifying headers of the first record seen
    # with that digest.
    payloadMap = {}
    writer = WARCWriter (output, gzip=True)

    # Add an additional warcinfo record, describing the transformations. This
    # is not ideal, since
    #   “A ‘warcinfo’ record describes the records that
    #   follow it […] until next ‘warcinfo’”
    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
    # A warcinfo record is expected at the beginning of every file. But it
    # might have been written by a different software, so we don’t want to
    # strip/replace that information, but supplement it.
    warcinfo = {
            'software': getSoftwareInfo (),
            'tool': 'crocoite-merge', # not the name of the cli tool
            'parameters': {'inputs': files},
            }
    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
    record = writer.create_warc_record ('', 'warcinfo',
            payload=payload,
            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
    writer.write_record (record)

    for path in files:
        with open (path, 'rb') as fd:
            for record in ArchiveIterator (fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    length = int (headers.get_header ('Content-Length'))
                    # A record without a digest cannot be compared to
                    # anything; never treat two such records as equal.
                    dup = None if csum is None else payloadMap.get (csum)
                    if dup is None:
                        if csum is not None:
                            payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                    'id': rid, 'date': headers.get_header('WARC-Date')}
                        unique += 1
                        uniqueLength += length
                    else:
                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
                        # Payload may be identical, but HTTP headers are
                        # (probably) not. Include them.
                        record = writer.create_revisit_record (
                                headers.get_header('WARC-Target-URI'), digest=csum,
                                refers_to_uri=dup['uri'], refers_to_date=dup['date'],
                                http_headers=record.http_headers)
                        record.rec_headers.add_header ('WARC-Truncated', 'length')
                        record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
                        revisit += 1
                        revisitLength += length
                else:
                    unique += 1
                writer.write_record (record)

    # Either total can be zero (no inputs, or only records that carry no
    # counted length); report an undefined ratio as null instead of crashing
    # after all the merge work is done.
    totalRecords = unique + revisit
    totalLength = uniqueLength + revisitLength
    json.dump (dict (
            unique=dict (records=unique, bytes=uniqueLength),
            revisit=dict (records=revisit, bytes=revisitLength),
            ratio=dict (
                    records=unique/totalRecords if totalRecords else None,
                    bytes=uniqueLength/totalLength if totalLength else None
                    ),
            ),
            sys.stdout,
            cls=StrJsonEncoder)
    sys.stdout.write ('\n')