Exemple #1
0
def main () :
    usage =  "./app/python/arcdump.py -f <file.arc>  [-t <working_dir>]\n "\
             "\t-f    : valid ARC file name\n"\
             "\t[-t]  : temporary working directory (default './')"

    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default="./")

    (options, args) = parser.parse_args()

    if len (args) != 0 :
       parser.error(" Incorrect arguments")

    if (not (options.filename)) :
        parser.error(" You must give ARC file name")

    count = 0
    a = AFile (options.filename, arc.ARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if (not (a)) :
             print "ARC file  not found  "
             return
        
    while (a . hasMoreRecords () ) :
 
          ar = a . nextRecord ()
          if (not (ar)) :
             print "bad arc file"
             a . destroy ()
             return
          count = count +1

          print "Arc record number :", count, "\n"
          print "***************************************************************************************************************\n\n"

          print "Url:\t ", ar . getUrl ()
          print "CreationDate:\t ", ar . getCreationDate ()
          print "MimeType:\t ", ar . getMimeType ()
          print "IpAdress:\t ", ar . getIpAddress()
        
          print "***************************************************************************************************************\n\n"

          ar . destroy ()

    a . destroy ()
    return
Exemple #2
0
def convert(fname, outfname, tmpdir, cmode):
    a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir)

    if (not (a)):
        print "ARC file not found "
        return

    if (cmode):
        cmode = warc.WARC_FILE_COMPRESSED_GZIP
    else:
        cmode = warc.WARC_FILE_UNCOMPRESSED

    w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER, cmode,
              tmpdir)

    if w == None:
        print "given temporary directory does not exist "
        a.destroy()
        return

    while (a.hasMoreRecords()):

        ar = a.nextRecord()

        if ar == None:
            print "bad ARC file"
            a.destroy()
            w.destroy()
            return

        wr = WRecord()

        if wr == None:
            print "can not create WARC record object"
            a.destroy()
            w.destroy()
            ar.destroy()
            return

        wr.setRecordType(warc.WARC_RESPONSE_RECORD)

        uri = ar.getUrl()
        wr.setTargetUri(uri, len(uri))

        date = ar.getCreationDate()
        wr.setDateFromArc(date, len(date))

        mime = ar.getMimeType()
        wr.setContentType(mime, len(mime))

        ip = ar.getIpAddress()
        wr.setIpAddress(ip, len(ip))

        s = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
        sh = sha.new(uri + s)
        rid = sh.hexdigest()
        rid = "uuid:" + rid
        wr.setRecordId(rid, len(rid))

        if (ar.transferContent(wr, a)):
            print "Unable to pass content to the WRecord"
            a.destroy()
            w.destroy()
            ar.destroy()
            return

        if (w.storeRecord(wr)):
            print "failed to write WRecord"
            a.destroy()
            w.destroy()
            ar.destroy()
            return

        ar.destroy()
        wr.destroy()

    a.destroy()
    w.destroy()