def main () : usage = "./app/python/arcdump.py -f <file.arc> [-t <working_dir>]\n "\ "\t-f : valid ARC file name\n"\ "\t[-t] : temporary working directory (default './')" parser = OptionParser(usage) parser.add_option("-f", "--file", dest="filename", help="read data from FILENAME") parser.add_option("-t", "--tempdir", dest="tmpdir", help="Temporary working directory", default="./") (options, args) = parser.parse_args() if len (args) != 0 : parser.error(" Incorrect arguments") if (not (options.filename)) : parser.error(" You must give ARC file name") count = 0 a = AFile (options.filename, arc.ARC_FILE_DETECT_COMPRESSION, options.tmpdir) if (not (a)) : print "ARC file not found " return while (a . hasMoreRecords () ) : ar = a . nextRecord () if (not (ar)) : print "bad arc file" a . destroy () return count = count +1 print "Arc record number :", count, "\n" print "***************************************************************************************************************\n\n" print "Url:\t ", ar . getUrl () print "CreationDate:\t ", ar . getCreationDate () print "MimeType:\t ", ar . getMimeType () print "IpAdress:\t ", ar . getIpAddress() print "***************************************************************************************************************\n\n" ar . destroy () a . destroy () return
def convert(fname, outfname, tmpdir, cmode): a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir) if (not (a)): print "ARC file not found " return if (cmode): cmode = warc.WARC_FILE_COMPRESSED_GZIP else: cmode = warc.WARC_FILE_UNCOMPRESSED w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER, cmode, tmpdir) if w == None: print "given temporary directory does not exist " a.destroy() return while (a.hasMoreRecords()): ar = a.nextRecord() if ar == None: print "bad ARC file" a.destroy() w.destroy() return wr = WRecord() if wr == None: print "can not create WARC record object" a.destroy() w.destroy() ar.destroy() return wr.setRecordType(warc.WARC_RESPONSE_RECORD) uri = ar.getUrl() wr.setTargetUri(uri, len(uri)) date = ar.getCreationDate() wr.setDateFromArc(date, len(date)) mime = ar.getMimeType() wr.setContentType(mime, len(mime)) ip = ar.getIpAddress() wr.setIpAddress(ip, len(ip)) s = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) sh = sha.new(uri + s) rid = sh.hexdigest() rid = "uuid:" + rid wr.setRecordId(rid, len(rid)) if (ar.transferContent(wr, a)): print "Unable to pass content to the WRecord" a.destroy() w.destroy() ar.destroy() return if (w.storeRecord(wr)): print "failed to write WRecord" a.destroy() w.destroy() ar.destroy() return ar.destroy() wr.destroy() a.destroy() w.destroy()