def addToWarc(fname, wfile, uri, mime, date, ip, cmode, maxsize, tmpdir):
    """Append the file ``fname`` to the WARC file ``wfile`` as a resource record.

    Opens ``wfile`` through ``WFile`` in writer mode, fills in a
    ``WARC_RESOURCE_RECORD`` whose content is taken from ``fname``, stores
    it, and destroys both the record and the file object again. Problems
    are reported on stdout; the function always returns ``None``.

    Args:
        fname: Name of the file handed to ``setContentFromFileName``.
        wfile: Name of the WARC file handed to ``WFile``.
        uri: Target URI of the record; also feeds the record id.
        mime: Content type of the record.
        date: Date string, passed to ``setDate`` unchanged.
        ip: IP address string, passed to ``setIpAddress`` unchanged.
        cmode: Compression mode forwarded to ``WFile``.
        maxsize: Size limit forwarded to ``WFile``.
        tmpdir: Temporary directory forwarded to ``WFile``.
    """
    w = WFile(wfile, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        print("Couldn't create a WARC File object")
        return

    # The nested try/finally blocks guarantee that the record and the file
    # object are destroyed even when a setter (or a len() call) raises.
    try:
        r = WRecord()
        if r is None:
            print("Couldn't create an empty WARC record object")
            return

        try:
            r.setRecordType(warc.WARC_RESOURCE_RECORD)
            r.setTargetUri(uri, len(uri))
            r.setDate(date, len(date))
            r.setContentType(mime, len(mime))

            # Use your own "unique identifier" function here.
            # NOTE(review): the salt below is the literal format string, not
            # a formatted timestamp, so the id depends on `uri` alone and two
            # records for the same URI get the same id. Confirm that this is
            # intended.
            s = "%Y-%m-%dT%H:%M:%SZ"
            rid = "uuid:" + sha.new(uri + s).hexdigest()
            r.setRecordId(rid, len(rid))

            r.setIpAddress(ip, len(ip))
            r.setContentFromFileName(fname)

            # convert() in this file treats a truthy storeRecord() result as
            # a failed write; report it here instead of dropping it silently.
            if w.storeRecord(r):
                print("failed to write WRecord")
        finally:
            r.destroy()
    finally:
        w.destroy()
def addToWarc(w, uri, data, f, mime):
    """Store ``data`` in the WARC file ``w`` as a resource record for ``uri``.

    The IP address is resolved from the host part of ``uri``. The record
    date comes from ``f.feed.updated_parsed`` and the record id from
    ``f.feed.id``. Exceptions raised while resolving the host or reading the
    feed fields propagate to the caller.

    Args:
        w: WARC file object; only ``storeRecord`` is called on it.
        uri: Target URI of the record.
        data: Record content, passed to ``setContentFromString``.
        f: Feed object providing ``feed.updated_parsed`` and ``feed.id``.
        mime: Content type of the record.
    """
    # Everything that can fail in pure Python is computed before the record
    # is allocated, so a failure here cannot leave a record undestroyed.
    host = urlparse.urlparse(uri).hostname
    ip = socket.gethostbyname(host)

    # warc-tools can't handle the feed's own date string in the format
    # '2009-04-07T05:12:50+02:00', so the date is rebuilt from the parsed
    # time tuple.
    # NOTE(review): the "Z" suffix is appended without any timezone
    # conversion; presumably updated_parsed is already UTC, so confirm.
    t = f.feed.updated_parsed
    stamp = datetime.datetime(
        t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec
    )
    updated = stamp.strftime("%Y-%m-%dT%H:%M:%SZ")

    rid = str(f.feed.id)

    r = WRecord()
    try:
        r.setRecordType(warc.WARC_RESOURCE_RECORD)
        r.setTargetUri(uri, len(uri))
        r.setDate(updated, len(updated))
        r.setContentType(mime, len(mime))
        r.setRecordId(rid, len(rid))
        r.setIpAddress(ip, len(ip))
        r.setContentFromString(data, len(data))
        w.storeRecord(r)
    finally:
        # Destroy the record even if one of the calls above raised.
        r.destroy()
def addToWarc(w, uri, mime, date, ip):
    """Store the file named by ``tmpfile`` in ``w`` as a resource record.

    Remember to check the return values of the record functions if failures
    must be detected; this function ignores them and returns ``None``.

    Args:
        w: WARC file object; only ``storeRecord`` is called on it.
        uri: Target URI of the record; also feeds the record id.
        mime: Content type of the record.
        date: Date string, passed to ``setDate`` unchanged.
        ip: IP address string, passed to ``setIpAddress`` unchanged.
    """
    r = WRecord()
    try:
        # Change the record type here if something other than a resource
        # record is wanted.
        r.setRecordType(warc.WARC_RESOURCE_RECORD)
        r.setTargetUri(uri, len(uri))
        r.setDate(date, len(date))
        r.setContentType(mime, len(mime))

        # Use your own "unique identifier" function here. The id is the SHA
        # digest of the URI plus the current local time at one-second
        # resolution.
        now = time.strftime("%a, %Y-%m-%dT%H:%M:%SZ", time.localtime())
        rid = "uuid:" + sha.new(uri + now).hexdigest()
        r.setRecordId(rid, len(rid))

        r.setIpAddress(ip, len(ip))
        # NOTE(review): `tmpfile` is not a parameter; it is looked up as a
        # module-level name at call time. Verify that the enclosing module
        # defines it.
        r.setContentFromFileName(tmpfile)
        w.storeRecord(r)
    finally:
        # Destroy the record even if one of the calls above raised.
        r.destroy()
def nextRecord(self):
    """Return a ``WRecord`` wrapping the result of ``warc.WFile_nextRecord``.

    A fresh ``WRecord`` is created and bound, via ``external_set``, to this
    object and to whatever ``warc.WFile_nextRecord(self.me)`` returns.
    NOTE(review): presumably a method of the WARC file wrapper class, with
    ``self.me`` as its native handle. Confirm against the class definition.
    """
    record = WRecord()
    handle = warc.WFile_nextRecord(self.me)
    record.external_set(self, handle)
    return record
def convert(fname, outfname, tmpdir, cmode):
    """Convert the ARC file ``fname`` into the WARC file ``outfname``.

    Every ARC record is copied into a new ``WARC_RESPONSE_RECORD`` carrying
    the ARC record's URL, creation date, MIME type and IP address, and is
    then stored in the output file. Conversion stops at the first failure,
    which is reported on stdout. The function always returns ``None``.

    Args:
        fname: Name of the ARC file to read.
        outfname: Name of the WARC file to write.
        tmpdir: Temporary directory forwarded to ``AFile`` and ``WFile``.
        cmode: Truthy selects ``WARC_FILE_COMPRESSED_GZIP`` output,
            otherwise ``WARC_FILE_UNCOMPRESSED``.
    """
    a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir)
    if not a:
        print("ARC file not found ")
        return

    if cmode:
        cmode = warc.WARC_FILE_COMPRESSED_GZIP
    else:
        cmode = warc.WARC_FILE_UNCOMPRESSED

    # 16 * 1024**3 is the size argument handed to WFile.
    w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        print("given temporary directory does not exist ")
        a.destroy()
        return

    # The outer finally releases both file objects on every exit path, and
    # the inner one releases the per-iteration records, including the WARC
    # record on the failure paths and when an exception is raised.
    try:
        while a.hasMoreRecords():
            ar = a.nextRecord()
            if ar is None:
                print("bad ARC file")
                return

            wr = WRecord()
            if wr is None:
                print("can not create WARC record object")
                ar.destroy()
                return

            try:
                wr.setRecordType(warc.WARC_RESPONSE_RECORD)

                uri = ar.getUrl()
                wr.setTargetUri(uri, len(uri))

                date = ar.getCreationDate()
                wr.setDateFromArc(date, len(date))

                mime = ar.getMimeType()
                wr.setContentType(mime, len(mime))

                ip = ar.getIpAddress()
                wr.setIpAddress(ip, len(ip))

                # The record id is the SHA digest of the URL plus the
                # current local time at one-second resolution.
                now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
                rid = "uuid:" + sha.new(uri + now).hexdigest()
                wr.setRecordId(rid, len(rid))

                # Both calls signal failure with a truthy return value.
                if ar.transferContent(wr, a):
                    print("Unable to pass content to the WRecord")
                    return

                if w.storeRecord(wr):
                    print("failed to write WRecord")
                    return
            finally:
                ar.destroy()
                wr.destroy()
    finally:
        a.destroy()
        w.destroy()