Exemple #1
0
def _convert_record(a, w):
    """Read the next record from *a*, rebuild it as a WRecord and store it in *w*.

    Returns True when the record was stored, False after printing a
    diagnostic when any step failed.  Both the source record and the new
    WRecord are destroyed before returning, on success and on failure.
    """
    ar = a.nextRecord()

    # `== None` (not `is None`) is kept on purpose: the wrapper classes are
    # defined elsewhere and may override equality -- TODO confirm.
    if ar == None:
        print("bad ARC file")
        return False

    wr_created = False
    try:
        wr = WRecord()

        if wr == None:
            print("can not create WARC record object")
            return False
        wr_created = True

        wr.setRecordType(warc.WARC_RESPONSE_RECORD)

        # Each setter takes the value together with its length.
        uri = ar.getUrl()
        wr.setTargetUri(uri, len(uri))

        date = ar.getCreationDate()
        wr.setDateFromArc(date, len(date))

        mime = ar.getMimeType()
        wr.setContentType(mime, len(mime))

        ip = ar.getIpAddress()
        wr.setIpAddress(ip, len(ip))

        # The record id is the SHA-1 hex digest of URL + current timestamp.
        # The format string ends in a literal "Z" (UTC designator), so the
        # time must come from gmtime(), not localtime().
        # NOTE(review): the timestamp has one-second resolution, so the same
        # URL converted twice within a second yields the same id.
        s = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        rid = "uuid:" + sha.new(uri + s).hexdigest()
        wr.setRecordId(rid, len(rid))

        # Both calls signal failure with a truthy return value.
        if ar.transferContent(wr, a):
            print("Unable to pass content to the WRecord")
            return False

        if w.storeRecord(wr):
            print("failed to write WRecord")
            return False

        return True
    finally:
        # Runs on every exit path, so neither record is leaked when a step
        # fails or raises.
        ar.destroy()
        if wr_created:
            wr.destroy()


def convert(fname, outfname, tmpdir, cmode):
    """Convert the ARC file *fname* into the WARC file *outfname*.

    Every record read from the ARC file is rewritten as a WARC response
    record carrying the original URL, creation date, MIME type and IP
    address, plus a freshly generated record id.

    Parameters:
        fname    -- path of the ARC file to read.
        outfname -- path of the WARC file to write.
        tmpdir   -- temporary directory handed to both file objects.
        cmode    -- truthy to write a gzip-compressed WARC file, falsy for an
                    uncompressed one.

    Returns None in all cases.  Failures are reported with a printed message
    and stop the conversion; all file and record objects opened so far are
    destroyed before returning, including when an exception propagates.
    """
    # print("...") with a single argument produces the same output under the
    # Python 2 print statement as the unparenthesised form.
    a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir)

    if not a:
        print("ARC file not found ")
        return

    w_opened = False
    try:
        if cmode:
            mode = warc.WARC_FILE_COMPRESSED_GZIP
        else:
            mode = warc.WARC_FILE_UNCOMPRESSED

        # Second argument is 16 GiB -- presumably the maximum size of the
        # output file; verify against the WFile constructor.
        w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER,
                  mode, tmpdir)

        # `== None` kept deliberately; see the note in _convert_record.
        if w == None:
            print("given temporary directory does not exist ")
            return
        w_opened = True

        while a.hasMoreRecords():
            if not _convert_record(a, w):
                return
    finally:
        a.destroy()
        if w_opened:
            w.destroy()