def addToWarc(w, uri, mime, date, ip):
    """Append a WARC resource record for ``uri`` to ``w``.

    A ``WRecord`` of type ``warc.WARC_RESOURCE_RECORD`` is created, its
    headers are filled in from the arguments, its content is attached via
    ``setContentFromFileName(tmpfile)`` and the record is handed to
    ``w.storeRecord``.

    ``tmpfile`` is neither a parameter nor a local: it must be defined in
    the enclosing (module) scope before this function is called.

    Args:
        w: object providing ``storeRecord(record)``.
        uri: target URI string; also used to seed the record id.
        mime: content-type string.
        date: record date string, passed through unchanged.
        ip: IP address string.

    The record is destroyed even when one of the calls raises, so a failure
    half-way through does not leak it.
    """
    # NOTE(review): the return values of the WRecord setters and of
    # storeRecord are not checked here; confirm the binding's success/failure
    # convention and add checks.
    r = WRecord()
    try:
        # change the record type if you want
        r.setRecordType(warc.WARC_RESOURCE_RECORD)
        r.setTargetUri(uri, len(uri))
        r.setDate(date, len(date))
        r.setContentType(mime, len(mime))

        # use your "unique identifier" function here
        # The id is the SHA-1 hex digest of the URI plus a timestamp with
        # one-second resolution, so two calls for the same URI within the
        # same second produce the same id.
        # NOTE: the `sha` module is deprecated in favour of hashlib.sha1.
        s = time.strftime("%a, %Y-%m-%dT%H:%M:%SZ", time.localtime())
        rid = "uuid:" + sha.new(uri + s).hexdigest()
        r.setRecordId(rid, len(rid))

        r.setIpAddress(ip, len(ip))
        r.setContentFromFileName(tmpfile)
        w.storeRecord(r)
    finally:
        # Release the record on both the success and the error path.
        r.destroy()
def addToWarc(w, uri, data, f, mime):
    """Append a WARC resource record holding ``data`` for ``uri`` to ``w``.

    The record's IP address is obtained by resolving the host name of
    ``uri``; its date and id are taken from ``f.feed``.

    Args:
        w: object providing ``storeRecord(record)``.
        uri: target URI string; its host name is resolved with
            ``socket.gethostbyname``.
        data: record content, stored with ``setContentFromString``.
        f: object exposing ``feed.updated_parsed`` (read through its
            ``tm_*`` attributes) and ``feed.id``.
        mime: content-type string.

    Errors from the URL parsing, the DNS lookup or the feed attribute access
    propagate to the caller before any record is allocated. Once the record
    exists it is destroyed even when a later call raises.
    """
    # Gather everything that can fail without a record first, so an error
    # here leaves nothing to clean up.
    o = urlparse.urlparse(uri)
    ip = socket.gethostbyname(o.hostname)

    # warc-tools can't handle the updated date in the format
    # '2009-04-07T05:12:50+02:00', so the parsed time is re-formatted.
    # NOTE(review): the literal 'Z' suffix assumes updated_parsed is UTC -
    # confirm against the feed parser in use.
    t = f.feed.updated_parsed
    dt = datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday,
                           t.tm_hour, t.tm_min, t.tm_sec)
    updated = dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    rid = str(f.feed.id)

    r = WRecord()
    try:
        r.setRecordType(warc.WARC_RESOURCE_RECORD)
        r.setTargetUri(uri, len(uri))
        r.setDate(updated, len(updated))
        r.setContentType(mime, len(mime))
        r.setRecordId(rid, len(rid))
        r.setIpAddress(ip, len(ip))
        r.setContentFromString(data, len(data))
        w.storeRecord(r)
    finally:
        # Release the record on both the success and the error path.
        r.destroy()