Example #1
0
def addToWarc(fname, wfile, uri, mime, date, ip, cmode, maxsize, tmpdir):
    """Append the content of file ``fname`` to ``wfile`` as one WARC resource record.

    ``wfile``, ``maxsize``, ``cmode`` and ``tmpdir`` are handed unchanged to the
    ``WFile`` constructor (opened with ``warc.WARC_FILE_WRITER``).  ``uri``,
    ``date``, ``mime`` and ``ip`` become the record's target URI, date,
    content type and IP address; each is passed together with its length.

    Returns None in every case.  Failures to create the file or the record
    are reported on stdout.  The return values of the setters and of
    ``storeRecord`` are not checked.
    """
    w = WFile(wfile, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        print("Couldn't create a WARC File object")
        return

    # try/finally guarantees the native objects are released even when one
    # of the calls below raises.
    try:
        r = WRecord()
        if r is None:
            print("Couldn't create an empty WARC record object")
            return

        try:
            r.setRecordType(warc.WARC_RESOURCE_RECORD)
            r.setTargetUri(uri, len(uri))
            r.setDate(date, len(date))
            r.setContentType(mime, len(mime))

            # Use your "unique identifier" function here.
            # NOTE: the salt is a fixed literal (it is never expanded by
            # strftime), so the id depends only on the URI: the same URI
            # always yields the same record id.
            s = "%Y-%m-%dT%H:%M:%SZ"
            rid = "uuid:" + sha.new(uri + s).hexdigest()
            r.setRecordId(rid, len(rid))
            r.setIpAddress(ip, len(ip))

            r.setContentFromFileName(fname)

            w.storeRecord(r)
        finally:
            r.destroy()
    finally:
        w.destroy()
Example #2
0
def addToWarc(w, uri, data, f, mime):
    """Store ``data`` as a resource record for ``uri`` through the writer ``w``.

    ``f`` supplies the record date (``f.feed.updated_parsed``) and the record
    id (``f.feed.id``).  The IP address is resolved from the host name found
    in ``uri``.  ``mime`` becomes the record's content type.
    """
    host = urlparse.urlparse(uri).hostname
    ip = socket.gethostbyname(host)

    record = WRecord()
    record.setRecordType(warc.WARC_RESOURCE_RECORD)
    record.setTargetUri(uri, len(uri))

    # warc-tools can't handle the feed's own date string (for example
    # '2009-04-07T05:12:50+02:00'), so the date is rebuilt from the parsed
    # time fields instead.
    parsed = f.feed.updated_parsed
    stamp = datetime.datetime(
        parsed.tm_year, parsed.tm_mon, parsed.tm_mday,
        parsed.tm_hour, parsed.tm_min, parsed.tm_sec
    ).strftime("%Y-%m-%dT%H:%M:%SZ")
    record.setDate(stamp, len(stamp))

    record.setContentType(mime, len(mime))

    feed_id = str(f.feed.id)
    record.setRecordId(feed_id, len(feed_id))

    record.setIpAddress(ip, len(ip))
    record.setContentFromString(data, len(data))

    w.storeRecord(record)
    record.destroy()
Example #3
0
def addToWarc(w, uri, mime, date, ip):
    """Store one resource record describing ``uri`` through the writer ``w``.

    The record content is loaded from the file named by ``tmpfile``.  That
    name is not a parameter: it must be defined in the surrounding scope.
    The return values of the setters and of ``storeRecord`` are not checked
    here; callers that need error reporting should add those checks.
    """
    record = WRecord()

    # Header fields.  Pick another warc record type constant if desired.
    record.setRecordType(warc.WARC_RESOURCE_RECORD)
    record.setTargetUri(uri, len(uri))
    record.setDate(date, len(date))
    record.setContentType(mime, len(mime))

    # Placeholder identifier: sha hex digest of the URI plus the local
    # time.  Replace with your own "unique identifier" function.
    stamp = time.strftime("%a, %Y-%m-%dT%H:%M:%SZ", time.localtime())
    record_id = "uuid:" + sha.new(uri + stamp).hexdigest()
    record.setRecordId(record_id, len(record_id))

    record.setIpAddress(ip, len(ip))
    record.setContentFromFileName(tmpfile)

    w.storeRecord(record)
    record.destroy()
Example #4
0
 def nextRecord(self):
     """Return a fresh WRecord attached to the next record of this file.

     The result of ``warc.WFile_nextRecord(self.me)`` is handed, together
     with this object, to the new record's ``external_set``.
     """
     record = WRecord()
     raw = warc.WFile_nextRecord(self.me)
     record.external_set(self, raw)
     return record
Example #5
0
def addToWarc(w, uri, mime, date, ip):
    """Write a resource record for ``uri`` into the WARC writer ``w``.

    ``date``, ``mime`` and ``ip`` are copied into the record header, each
    passed along with its length.  The body comes from the file named by
    ``tmpfile``, a name this function expects to find in the enclosing
    scope.  None of the setter or ``storeRecord`` return values are checked.
    """
    rec = WRecord()

    # Change the record type here if a different kind of record is wanted.
    rec.setRecordType(warc.WARC_RESOURCE_RECORD)
    rec.setTargetUri(uri, len(uri))
    rec.setDate(date, len(date))
    rec.setContentType(mime, len(mime))

    # Stand-in for a real "unique identifier" function: hash the URI
    # together with the current local time.
    now = time.strftime("%a, %Y-%m-%dT%H:%M:%SZ", time.localtime())
    digest = sha.new(uri + now).hexdigest()
    rec_id = "uuid:" + digest
    rec.setRecordId(rec_id, len(rec_id))

    rec.setIpAddress(ip, len(ip))
    rec.setContentFromFileName(tmpfile)

    w.storeRecord(rec)
    rec.destroy()
Example #6
0
def convert(fname, outfname, tmpdir, cmode):
    """Convert the ARC file ``fname`` into the WARC file ``outfname``.

    Every ARC record is rewritten as a WARC response record carrying the
    ARC record's URL, creation date, MIME type, IP address and content.

    ``tmpdir`` is passed to both the ARC reader and the WARC writer.
    ``cmode`` selects the output compression: a true value writes a
    gzip-compressed WARC, a false value an uncompressed one.

    Returns None in every case.  Failures are reported on stdout and stop
    the conversion.  Every object created here is destroyed on all exit
    paths, including the WARC record when transferring or storing fails.
    """
    a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir)

    if not a:
        print("ARC file not found ")
        return

    if cmode:
        cmode = warc.WARC_FILE_COMPRESSED_GZIP
    else:
        cmode = warc.WARC_FILE_UNCOMPRESSED

    # The second argument (the writer's max size) is 16 GiB.
    w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER, cmode,
              tmpdir)

    if w is None:
        print("given temporary directory does not exist ")
        a.destroy()
        return

    try:
        while a.hasMoreRecords():
            ar = a.nextRecord()

            if ar is None:
                print("bad ARC file")
                return

            wr = WRecord()

            if wr is None:
                print("can not create WARC record object")
                ar.destroy()
                return

            # Both records are released when the iteration ends, whether
            # it succeeded, failed, or raised.
            try:
                wr.setRecordType(warc.WARC_RESPONSE_RECORD)

                uri = ar.getUrl()
                wr.setTargetUri(uri, len(uri))

                date = ar.getCreationDate()
                wr.setDateFromArc(date, len(date))

                mime = ar.getMimeType()
                wr.setContentType(mime, len(mime))

                ip = ar.getIpAddress()
                wr.setIpAddress(ip, len(ip))

                # Record id: sha hex digest of the URL plus the local time.
                s = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
                rid = "uuid:" + sha.new(uri + s).hexdigest()
                wr.setRecordId(rid, len(rid))

                # Both calls signal failure with a true return value.
                if ar.transferContent(wr, a):
                    print("Unable to pass content to the WRecord")
                    return

                if w.storeRecord(wr):
                    print("failed to write WRecord")
                    return
            finally:
                ar.destroy()
                wr.destroy()
    finally:
        a.destroy()
        w.destroy()