def addToWarc(fname, wfile, uri, mime, date, ip, cmode, maxsize, tmpdir):
    """Store the content of the file ``fname`` in ``wfile`` as one resource record.

    A ``WFile`` is opened in writer mode, a ``WARC_RESOURCE_RECORD`` is
    filled in and stored, and both objects are destroyed before returning.

    fname   -- file whose content is attached to the record
    wfile   -- name of the WARC file, passed to ``WFile``
    uri     -- value for the record's target URI
    mime    -- value for the record's content type
    date    -- value for the record's date
    ip      -- value for the record's IP address
    cmode   -- compression mode passed to ``WFile``
    maxsize -- maximum size passed to ``WFile``
    tmpdir  -- temporary directory passed to ``WFile``

    Returns None.  Problems are reported on stdout.
    """
    w = WFile(wfile, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        print("Couldn't create a WARC File object")
        return

    try:
        r = WRecord()
        if r is None:
            print("Couldn't create an empty WARC record object")
            return

        try:
            r.setRecordType(warc.WARC_RESOURCE_RECORD)
            r.setTargetUri(uri, len(uri))
            r.setDate(date, len(date))
            r.setContentType(mime, len(mime))

            # Use your own "unique identifier" function here.
            # NOTE(review): the salt is a constant format string, not a
            # timestamp, so the same URI always yields the same record id.
            salt = "%Y-%m-%dT%H:%M:%SZ"
            rid = "uuid:" + sha.new(uri + salt).hexdigest()
            r.setRecordId(rid, len(rid))

            r.setIpAddress(ip, len(ip))
            r.setContentFromFileName(fname)

            # NOTE(review): a true return value is treated as failure, which
            # is presumably the binding's convention -- confirm.
            if w.storeRecord(r):
                print("failed to write WRecord")
        finally:
            # Release the record even when a setter raised.
            r.destroy()
    finally:
        w.destroy()
def main():
    """Fetch a list of URLs and store every document in one WARC file.

    Command line options:
      -f  file containing one URL per line (stdin when omitted)
      -o  WARC output file name (mandatory)
      -s  temporary file name, stored in the global ``tmpfile``
      -t  temporary directory, stored in the global ``tmpdir``
      -q  quiet mode, sets the global ``debug`` to False

    Blank lines and lines starting with ``#`` are skipped.  Each remaining
    URL is passed to ``fetcher`` and the result is handed to ``addToWarc``.
    Exits with status 1 when ``-o`` is missing.
    """
    global debug
    global tmpdir
    global tmpfile

    usage = (
        " Download list of URLs and store them in WARC container \n\n"
        + sys.argv[0]
        + " -f <urls_list> -o <warc_file> [-s <tmpfile>] [-t <tmpdir>] [-q]"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="input", type="string",
                      help="file name containing a list of URLs")
    parser.add_option("-o", "--output", dest="out", type="string",
                      help="WARC output file name to store documents")
    parser.add_option("-s", "--tmpfile", dest="tmpfile", type="string",
                      help='temporary file (default ".tmpfile")')
    parser.add_option("-t", "--tmpdir", dest="tmpdir", type="string",
                      help='temporary directory (default ".")')
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
                      help="quiet, no debug")
    (options, args) = parser.parse_args()

    if options.tmpfile:
        tmpfile = options.tmpfile
    if options.quiet:
        debug = False
    if options.tmpdir:
        tmpdir = options.tmpdir

    # Validate the mandatory option before touching any input, so that a
    # missing -o is reported immediately instead of after reading stdin.
    if options.out is None:
        logger("-o <warc_file> option is mandatory\n")
        sys.exit(1)

    if options.input:
        source = open(options.input)
        try:
            urls = source.readlines()
        finally:
            source.close()
    else:
        urls = sys.stdin.readlines()

    # open the WARC file for writing
    cmode = warc.WARC_FILE_COMPRESSED_GZIP
    maxsize = 600 * 1024 * 1024
    w = WFile(options.out, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)

    try:
        for url in urls:
            url = url.strip()
            if not url or url[0] == "#":
                continue

            # fetch the URL
            effective_url, mime, date, ip, status, redirect = fetcher(url)
            addToWarc(w, effective_url, mime, date, ip)
            logger("%-30s %-3d %-3d %-30s\n"
                   % (url, status, redirect, effective_url))
    finally:
        try:
            # NOTE(review): tmpfile is presumably written by fetcher(); it
            # does not exist when no URL was fetched, so only remove it
            # when present.
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
        finally:
            w.destroy()
def indexWarc(warcFileName):
    """Post every Atom record stored in a WARC file to Solr.

    Every record of ``warcFileName`` is read completely into memory.  A
    record whose content type equals ``application/atom+xml`` is converted
    with ``OpdsToCatalog`` and ``CatalogToSolr``, written to
    ``solr_import.xml`` inside a fresh temporary directory and submitted
    with ``/solr/example/exampledocs/post.sh``.

    Returns None.  Raises AssertionError when the WARC file object is falsy
    or when the post script exits with a non-zero status.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    assert w

    while w.hasMoreRecords():
        r = w.nextRecord()
        if r is None:
            w.destroy()
            # The import file is unlinked after every record, so the
            # directory is empty here and must not be left behind.
            os.rmdir(tempdir)
            print("bad record.. bailing!")
            return

        url = r.getTargetUri()
        print('processing ' + url)

        b = WBloc(w, r, False, 64 * 1024)
        # Collect the chunks and join once instead of repeated "+=".
        chunks = []
        while True:
            buf = b.getNext()
            if not buf:
                break
            chunks.append(buf)
        content = ''.join(chunks)

        if 'application/atom+xml' == r.getContentType():
            ingestor = bookserver.catalog.ingest.OpdsToCatalog(content, url)
            c = ingestor.getCatalog()
            provider = getProvider(url)
            renderer = bookserver.catalog.output.CatalogToSolr(c, provider)
            solr_xml = renderer.toString()

            solr_import_xml = tempdir + "/solr_import.xml"
            f = open(solr_import_xml, 'w')
            try:
                f.write(solr_xml)
            finally:
                f.close()

            command = """/solr/example/exampledocs/post.sh '%s'""" % (
                solr_import_xml)
            (ret, out) = commands.getstatusoutput(command)
            # Show the script output whenever Solr did not report status 0.
            if -1 == out.find('<int name="status">0</int>'):
                print(out)
            assert 0 == ret
            os.unlink(solr_import_xml)

        b.destroy()
        r.destroy()

    os.rmdir(tempdir)
    w.destroy()
def indexWarc(warcFileName):
    """Post every Atom record stored in a WARC file to Solr.

    Every record of ``warcFileName`` is read completely into memory.  A
    record whose content type equals ``application/atom+xml`` is converted
    with ``OpdsToCatalog`` and ``CatalogToSolr``, written to
    ``solr_import.xml`` inside a fresh temporary directory and submitted
    with ``/solr/example/exampledocs/post.sh``.

    Returns None.  Raises AssertionError when the WARC file object is falsy
    or when the post script exits with a non-zero status.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    assert w

    while w.hasMoreRecords():
        r = w.nextRecord()
        if r is None:
            w.destroy()
            # The import file is unlinked after every record, so the
            # directory is empty here and must not be left behind.
            os.rmdir(tempdir)
            print("bad record.. bailing!")
            return

        url = r.getTargetUri()
        print('processing ' + url)

        b = WBloc(w, r, False, 64 * 1024)
        # Collect the chunks and join once instead of repeated "+=".
        chunks = []
        while True:
            buf = b.getNext()
            if not buf:
                break
            chunks.append(buf)
        content = ''.join(chunks)

        if 'application/atom+xml' == r.getContentType():
            ingestor = bookserver.catalog.ingest.OpdsToCatalog(content, url)
            c = ingestor.getCatalog()
            provider = getProvider(url)
            renderer = bookserver.catalog.output.CatalogToSolr(c, provider)
            solr_xml = renderer.toString()

            solr_import_xml = tempdir + "/solr_import.xml"
            f = open(solr_import_xml, 'w')
            try:
                f.write(solr_xml)
            finally:
                f.close()

            command = """/solr/example/exampledocs/post.sh '%s'""" % (
                solr_import_xml)
            (ret, out) = commands.getstatusoutput(command)
            # Show the script output whenever Solr did not report status 0.
            if -1 == out.find('<int name="status">0</int>'):
                print(out)
            assert 0 == ret
            os.unlink(solr_import_xml)

        b.destroy()
        r.destroy()

    os.rmdir(tempdir)
    w.destroy()
def indexWarc(indexin, fname, tempdir):
    """Append one index line per response/resource record of a WARC file.

    indexin -- index file, opened in append mode
    fname   -- WARC file to read; also written as the last path field
    tempdir -- temporary directory passed to ``WFile``

    A line has the form ``<keypath> <timestamp> <content-type> 200 <offset>
    <redirect> <fname> warc``.  Records whose key path is 400 characters or
    longer are skipped.  Always returns None.
    """
    f = open(indexin, 'a')
    try:
        w = WFile(fname, CONSTANT, warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        if not w:
            # The index file is closed by the enclosing "finally".
            return None

        try:
            while w.hasMoreRecords():
                r = w.nextRecord()
                try:
                    rect = getRecTypeName(r.getRecordType())
                    if rect not in ('WARC_RESPONSE_RECORD',
                                    'WARC_RESOURCE_RECORD'):
                        continue

                    headers = getHeaders(w, r)

                    # A missing or unparsable status counts as 200.
                    try:
                        if 'status' in headers:
                            status = int(headers['status'])
                        else:
                            status = 200
                    except (TypeError, ValueError):
                        status = 200

                    if 'content-type' in headers:
                        contenttype = headers['content-type'].split(';')[0]
                    else:
                        contenttype = 'text/plain'

                    # fix this: redirect detection is switched off by the
                    # trailing "and False", so the field is always '-'.
                    if status > 300 and status < 400 and False:
                        redirect = ''
                    else:
                        redirect = '-'

                    kp = keypath(r.getTargetUri())
                    if len(kp) < 400:
                        # NOTE(review): the status column is the literal
                        # 200, not the parsed ``status`` -- confirm that
                        # this is intended.
                        f.write('%s %s %s %u %u %s %s warc\n' % (
                            kp, getTimestampFromWarcDate(r.getDate()),
                            contenttype, 200, r.getOffset(), redirect,
                            fname))
                finally:
                    r.destroy()
        finally:
            w.destroy()
    finally:
        f.close()
def createNewWarc(domain, domain_warc_dir, tempdir, crawlDateTime):
    """Create a gzip-compressed WARC writer named after a domain and a date.

    The file is ``<domain_warc_dir>/<domain>_<iso date>_warc.gz`` where the
    date is ``crawlDateTime.isoformat()``.

    Returns the tuple ``(WFile, file name, crawlDateTime)``.
    """
    # The file name carries the domain and a date.  Earlier versions started
    # from 01-01-1970 and renamed the file to the latest update date of the
    # feed once the crawl finished.  A new WARC is now created on every
    # crawl instead of appending to an old one, so the crawl date is used.
    stamp = crawlDateTime
    path = '%s/%s_%s_warc.gz' % (domain_warc_dir, domain, stamp.isoformat())
    print('creating new warc file ' + path)

    gzip_mode = warc.WARC_FILE_COMPRESSED_GZIP
    writer = WFile(path, config['max_warc_size'], warc.WARC_FILE_WRITER,
                   gzip_mode, tempdir)
    return writer, path, stamp
def getLatestWarc(domain_warc_dir, tempdir):
    """Open the last ``*_warc.gz`` file of a directory (sorted by name).

    domain_warc_dir -- directory searched for ``*_warc.gz`` files
    tempdir         -- temporary directory passed to ``WFile``

    Returns ``(WFile, file name, datetime)``; the datetime is parsed from the
    ``<date>`` part of ``<name>_<date>_warc.gz``.  Returns
    ``(None, None, None)`` when the directory holds no such file.  Raises
    AssertionError when the file name does not follow that pattern.
    """
    warcs = sorted(glob.glob(domain_warc_dir + '/*_warc.gz'))
    if not warcs:
        print('No warc file found in ' + domain_warc_dir)
        return None, None, None

    warcFileName = warcs[-1]
    # The name includes the directory part, which may contain whitespace, so
    # only the date group is restricted to non-blank characters.  The dot is
    # escaped and the pattern is anchored at the end of the name.
    m = re.match(r"(.+)_(\S+)_warc\.gz$", warcFileName)
    assert m is not None

    isodate = xml.utils.iso8601.parse(m.group(2) + '+00:00')
    warcDateTime = datetime.datetime.utcfromtimestamp(isodate)

    cmode = warc.WARC_FILE_COMPRESSED_GZIP
    w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_WRITER,
              cmode, tempdir)
    return w, warcFileName, warcDateTime
def getRecord(warcname, offset, tempdir='.'):
    """Read the record at ``offset`` and split it into headers and content.

    warcname -- WARC file to open for reading
    offset   -- record offset; converted with ``int()`` before seeking
    tempdir  -- temporary directory passed to ``WFile``

    Returns ``(headers, content)``.  ``content`` is everything after the
    first blank line (``\\r\\n\\r\\n``) of the block.  ``headers`` maps each
    header name to the text after the first ``:`` (not stripped); a line
    starting with ``HTTP`` fills the keys ``protocol``, ``status`` and
    ``code`` from its space-separated parts.  An empty block or a missing
    record yields ``({}, '')``.
    """
    w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    w.seek(int(offset))
    rec = w.nextRecord()
    if rec is None:
        w.destroy()
        return ({}, '')

    b = WBloc(w, rec, False, BLOCKSIZE)
    try:
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
    finally:
        b.destroy()
        rec.destroy()
        w.destroy()
    dat = ''.join(chunks)

    headers = {}
    content = ''
    if dat:
        bits = dat.split('\r\n\r\n', 1)
        if len(bits) > 1:
            content = bits[1]

        for line in bits[0].split('\r\n'):
            line = line.strip()
            if not line:
                break
            if ':' in line:
                # Split on the first colon only, so values that contain
                # colons themselves (URLs, dates) are kept intact.
                name, value = line.split(':', 1)
                headers[name] = value
            elif line.startswith('HTTP'):
                # zip() stops at the shorter sequence, so a short status
                # line fills only the leading keys.
                for key, part in zip(('protocol', 'status', 'code'),
                                     line.split(' ')):
                    headers[key] = part

    return (headers, content)
def getRecord(warcname, offset, tempdir='.'):
    """Read the record at ``offset`` and split it into headers and content.

    warcname -- WARC file to open for reading
    offset   -- record offset; converted with ``int()`` before seeking
    tempdir  -- temporary directory passed to ``WFile``

    Returns ``(headers, content)``.  ``content`` is everything after the
    first blank line (``\\r\\n\\r\\n``) of the block.  ``headers`` maps each
    header name to the text after the first ``:`` (not stripped); a line
    starting with ``HTTP`` fills the keys ``protocol``, ``status`` and
    ``code`` from its space-separated parts.  An empty block or a missing
    record yields ``({}, '')``.
    """
    w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    w.seek(int(offset))
    rec = w.nextRecord()
    if rec is None:
        w.destroy()
        return ({}, '')

    b = WBloc(w, rec, False, BLOCKSIZE)
    try:
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
    finally:
        b.destroy()
        rec.destroy()
        w.destroy()
    dat = ''.join(chunks)

    headers = {}
    content = ''
    if dat:
        bits = dat.split('\r\n\r\n', 1)
        if len(bits) > 1:
            content = bits[1]

        for line in bits[0].split('\r\n'):
            line = line.strip()
            if not line:
                break
            if ':' in line:
                # Split on the first colon only, so values that contain
                # colons themselves (URLs, dates) are kept intact.
                name, value = line.split(':', 1)
                headers[name] = value
            elif line.startswith('HTTP'):
                # zip() stops at the shorter sequence, so a short status
                # line fills only the leading keys.
                for key, part in zip(('protocol', 'status', 'code'),
                                     line.split(' ')):
                    headers[key] = part

    return (headers, content)
def indexWarc(indexin, fname, tempdir):
    """Append one index line per response/resource record of a WARC file.

    indexin -- index file, opened in append mode
    fname   -- WARC file to read; also written as the last path field
    tempdir -- temporary directory passed to ``WFile``

    A line has the form ``<keypath> <timestamp> <content-type> 200 <offset>
    <redirect> <fname> warc``.  Records whose key path is 400 characters or
    longer are skipped.  Always returns None.
    """
    f = open(indexin, 'a')
    try:
        w = WFile(fname, CONSTANT, warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        if not w:
            # The index file is closed by the enclosing "finally".
            return None

        try:
            while w.hasMoreRecords():
                r = w.nextRecord()
                try:
                    rect = getRecTypeName(r.getRecordType())
                    if rect not in ('WARC_RESPONSE_RECORD',
                                    'WARC_RESOURCE_RECORD'):
                        continue

                    headers = getHeaders(w, r)

                    # A missing or unparsable status counts as 200.
                    try:
                        if 'status' in headers:
                            status = int(headers['status'])
                        else:
                            status = 200
                    except (TypeError, ValueError):
                        status = 200

                    if 'content-type' in headers:
                        contenttype = headers['content-type'].split(';')[0]
                    else:
                        contenttype = 'text/plain'

                    # fix this: redirect detection is switched off by the
                    # trailing "and False", so the field is always '-'.
                    if status > 300 and status < 400 and False:
                        redirect = ''
                    else:
                        redirect = '-'

                    kp = keypath(r.getTargetUri())
                    if len(kp) < 400:
                        # NOTE(review): the status column is the literal
                        # 200, not the parsed ``status`` -- confirm that
                        # this is intended.
                        f.write('%s %s %s %u %u %s %s warc\n' % (
                            kp, getTimestampFromWarcDate(r.getDate()),
                            contenttype, 200, r.getOffset(), redirect,
                            fname))
                finally:
                    r.destroy()
        finally:
            w.destroy()
    finally:
        f.close()
def main():
    """Print the WARC records whose URI, MIME type or record type matches.

    Command line options:
      -f  WARC file name (mandatory)
      -u  regular expression applied to the target URI
      -m  regular expression applied to the content type
      -r  regular expression applied to the record type name
      -t  temporary working directory (default "./")
      -v  also dump the ANVL fields of every matching record

    Exactly one of -u, -m and -r must be given.  For every matching record
    the fixed columns, the optional header fields that are set and, with
    -v, the ANVL fields are written to stdout.
    """
    usage = (
        "./app/python/warcfilter -f <file.warc> [-u <uri>] [-m <mime>] [-r <rtype>] [-v] [-t <working_dir>]\n "
        "\t-f : valid WARC file name\n"
        "\t[-u] : regular expression comparison with URI\n"
        "\t[-m] : regular expression comparison with MIME\n"
        "\t[-r] : regular expression comparison with record types (see 'public/wrectype.h' for possible values)\n"
        "\t[-t] : temporary working directory (default './')\n"
        "\t[-v] : dump ANVL (default false)"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-u", "--uri", dest="uri",
                      help="filter applied to uri field")
    parser.add_option("-m", "--mime", dest="mime",
                      help="filter applied to mime field")
    parser.add_option("-r", "--rtype", dest="rtype",
                      help="filter applied to record type field")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default="./")
    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error(" Incorrect arguments")
    if not options.filename:
        parser.error(" You must give WARC file name")

    rectype = {0: 'WARC_UNKNOWN_RECORD', 1: 'WARC_INFO_RECORD',
               2: 'WARC_RESPONSE_RECORD', 3: 'WARC_REQUEST_RECORD',
               4: 'WARC_METADATA_RECORD', 5: 'WARC_REVISIT_RECORD',
               6: 'WARC_CONVERSION_RECORD', 7: 'WARC_CONTINUATION_RECORD',
               8: 'WARC_RESOURCE_RECORD'}

    # Exactly one filter has to be active.
    filters = [f for f in (options.uri, options.mime, options.rtype) if f]
    if len(filters) != 1:
        parser.error(
            " You must apply filter to one of fields : uri or mime or record type"
        )

    # (format, record getter) for the fixed columns, in output order.
    columns = (
        ("%-20u ", "getOffset"),
        ("%-20u ", "getCompressedSize"),
        ("%-10s ", "getWarcId"),
        ("%-20u ", "getContentLength"),
        ("%-45u ", "getRecordType"),
        ("%-44s ", "getDate"),
        ("%-86s ", "getRecordId"),
    )
    # (format, label, record getter) for fields printed only when set.
    optional_fields = (
        ("%-35s: %-20s", "Content-Type", "getContentType"),
        ("%-35s: %-20s", "WARC-Concurrent-To", "getConcurrentTo"),
        ("%-35s: %-20s", "WARC-Block-Digest", "getBlockDigest"),
        ("%-35s: %-20s", "WARC-Payload-Digest", "getPayloadDigest"),
        ("%-35s: %-20s", "WARC-IP-Address", "getIpAddress"),
        ("%-35s: %-20s", "WARC-Refers-To", "getRefersTo"),
        ("%-35s: %-20s", "WARC-Target-URI", "getTargetUri"),
        ("%-35s: %-20s", "WARC-Truncated", "getTruncated"),
        ("%-35s: %-20s", "WARC-Warcinfo-ID", "getWarcInfoId"),
        ("%-35s: %-20s", "WARC-Filename:", "getFileName"),
        ("%-35s: %-20s", "WARC-Profile", "getProfile"),
        ("%-35s: %-20s", "WARC-Identified-Payload-type", "getPayloadType"),
        ("%-35s: %-20s", "WARC-Segment-Origin-ID", "getSegmentOriginId"),
        ("%-35s: %-20d", "WARC-Segment-Number", "getSegmentNumber"),
        ("%-35s: %-20d", "WARC-Segment-Total-Length", "getSegTotalLength"),
    )

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if not w:
        print("WARC file not found ")
        return

    while w.hasMoreRecords():
        r = w.nextRecord()
        if not r:
            print("bad WARC file ")
            # Release the file before giving up.
            w.destroy()
            return

        matched = None
        if options.uri:
            string = r.getTargetUri()
            if string:
                matched = searchFromAnyPosition(options.uri, string)
        if options.mime:
            string = r.getContentType()
            if string:
                matched = searchFromAnyPosition(options.mime, string)
        if options.rtype:
            rt = warc.WRecord_getRecordType(r)
            # An unlisted type code is treated as the unknown record type.
            matched = searchFromAnyPosition(options.rtype,
                                            rectype.get(rt, rectype[0]))

        if matched:
            for fmt, getter in columns:
                sys.stdout.write(fmt % getattr(r, getter)())

            print("More Fields:\n")
            any_field = False
            for fmt, label, getter in optional_fields:
                value = getattr(r, getter)()
                if value:
                    print(fmt % (label, value))
                    any_field = True
            if not any_field:
                print("--No One --")

            if options.verbose:
                nb = r.getAnvlFieldsNumber()
                if nb != 0:
                    print("-- More Info--\n")
                    for i in range(nb):
                        print("key :  %s" % (r.getAnvlFieldKey(i),))
                        print("Value:  %s" % (r.getAnvlFieldValue(i),))

        r.destroy()

    w.destroy()
    return
# Opens the WARC file `warcFileName` for reading with automatic compression
# detection, using config['max_warc_size'] as the size limit and `tempdir` as
# the working directory.
# NOTE(review): only this first statement of the function is visible here. The
# `url` parameter is unused, nothing is returned, and the WFile is never
# destroyed, so the remainder of the body is presumably missing -- confirm
# against the complete file before relying on this function.
def urlInWarc(url, warcFileName, tempdir): w = WFile ( warcFileName, config['max_warc_size'], warc.WARC_FILE_READER, warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
def main():
    """Write the block of the WARC record at a given offset to stdout.

    Command line options:
      -f  WARC file name (mandatory)
      -o  record offset, an integer (mandatory)
      -e  stores False in ``options.headers`` (default True); the value is
          passed as the third argument of ``WBloc``
      -t  temporary working directory (default ".")

    Exits with status 0 when there is no record at the offset and with
    status 1 when the WARC file or the record cannot be obtained.
    """
    usage = (
        "./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-v] [-t <working_dir>]\n "
        "\t-f : valid WARC file name\n"
        "\t-o : record offset\n"
        "\t[-e] : print HTTP response headers (default 'no')\n"
        "\t[-t] : temporary working directory (default './')\n"
        # The example used "-n", which is not a defined option.
        "./app/python/wgetbloc.py -f foo.warc.gz -o 7"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")
    parser.add_option("-e", "--headers", action="store_false", default=True,
                      dest="headers")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")
    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error(" Incorrect arguments")
    if not options.filename:
        parser.error(" You must give WARC file name")
    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if not w:
        print("WARC file not found ")
        # Stop here: every following step needs a usable file object.
        sys.exit(1)

    # go to the specified offset
    w.seek(options.offset)
    if not w.hasMoreRecords():
        print("End of file reached, or no record at this offset %s"
              % options.offset)
        w.destroy()
        sys.exit(0)

    r = w.nextRecord()
    if not r:
        print("Couldn't get the WARC record object")
        w.destroy()
        sys.exit(1)

    # choose your buffer size (ex. 64K = 64 * 1024) to read the payload
    # (with the HTTP headers or not, use the boolean flag) chunk by chunk
    b = WBloc(w, r, options.headers, 64 * 1024)
    try:
        while True:
            buff = b.getNext()
            if not buff:
                # no more data to read: end of the record reached
                break
            # the chunk size is returned by calling "b.getLastChunkSize()"
            sys.stdout.write(buff)
    finally:
        b.destroy()
        r.destroy()
        w.destroy()
def main():
    """Dump the header fields of every record of a WARC file to stdout.

    Command line options:
      -f  WARC file name (mandatory)
      -t  temporary working directory (default ".")
      -v  also dump the ANVL fields of every record

    A header line is printed first.  For each record the fixed columns are
    written, followed by the optional fields that are set, or
    "--No One --" when none is.
    """
    usage = (
        "./app/python/warcdump.py -f <file.warc> [-v] [-t <working_dir>]\n "
        "\t-f : valid WARC file name\n"
        "\t[-t] : temporary working directory (default './')\n"
        "\t[-v] : dump ANVL (default false)"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")
    (options, args) = parser.parse_args()

    if args:
        parser.error("Incorrect arguments")
    if not options.filename:
        parser.error("You must provide a WARC file name")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if w == None:
        print("Couldn't create a WARC File object")
        return

    titles = ("Offset", "CSize", "WarcId", "Content-Length", "WARC-Type",
              "WARC-Date", "WARC-Record-ID")
    print("%-20s %-20s %-10s %-20s %-45s %-44s %-86s " % titles)

    # (format, record getter) for the fixed columns, in output order.
    columns = (
        ("%-20u ", "getOffset"),
        ("%-20u ", "getCompressedSize"),
        ("%-10s ", "getWarcId"),
        ("%-20u ", "getContentLength"),
        ("%-45u ", "getRecordType"),
        ("%-44s ", "getDate"),
        ("%-86s ", "getRecordId"),
    )
    # (format, label, record getter) for fields printed only when set.
    optional_fields = (
        ("%-35s: %-20s", "Content-Type", "getContentType"),
        ("%-35s: %-20s", "WARC-Concurrent-To", "getConcurrentTo"),
        ("%-35s: %-20s", "WARC-Block-Digest", "getBlockDigest"),
        ("%-35s: %-20s", "WARC-Payload-Digest", "getPayloadDigest"),
        ("%-35s: %-20s", "WARC-IP-Address", "getIpAddress"),
        ("%-35s: %-20s", "WARC-Refers-To", "getRefersTo"),
        ("%-35s: %-20s", "WARC-Target-URI", "getTargetUri"),
        ("%-35s: %-20s", "WARC-Truncated", "getTruncated"),
        ("%-35s: %-20s", "WARC-Warcinfo-ID", "getWarcInfoId"),
        ("%-35s: %-20s", "WARC-Filename:", "getFileName"),
        ("%-35s: %-20s", "WARC-Profile", "getProfile"),
        ("%-35s: %-20s", "WARC-Identified-Payload-type", "getPayloadType"),
        ("%-35s: %-20s", "WARC-Segment-Origin-ID", "getSegmentOriginId"),
        ("%-35s: %-20d", "WARC-Segment-Number", "getSegmentNumber"),
        ("%-35s: %-20d", "WARC-Segment-Total-Length", "getSegTotalLength"),
    )

    while w.hasMoreRecords():
        r = w.nextRecord()
        if r == None:
            w.destroy()
            print("Couldn't get the WARC record object")
            return

        for fmt, getter in columns:
            sys.stdout.write(fmt % getattr(r, getter)())

        # Becomes WARC_TRUE as soon as one optional field was printed.
        m1 = warc.WARC_FALSE
        for fmt, label, getter in optional_fields:
            if getattr(r, getter)():
                print(fmt % (label, getattr(r, getter)()))
                m1 = warc.WARC_TRUE
        if not m1:
            print("--No One --")

        if options.verbose:
            nb = r.getAnvlFieldsNumber()
            if nb != 0:
                print("-- More Info--\n")
                i = 0
                while i < nb:
                    print("key :  %s" % (r.getAnvlFieldKey(i),))
                    print("Value:  %s" % (r.getAnvlFieldValue(i),))
                    i += 1

        r.destroy()

    w.destroy()
    return
def main():
    """Write the block of the WARC record at a given offset to stdout.

    Command line options:
      -f  WARC file name (mandatory)
      -o  record offset, an integer (mandatory)
      -e  stores False in ``options.headers`` (default True); the value is
          passed as the third argument of ``WBloc``
      -t  temporary working directory (default ".")

    Exits with status 0 when there is no record at the offset and with
    status 1 when the WARC file or the record cannot be obtained.
    """
    usage = (
        "./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-v] [-t <working_dir>]\n "
        "\t-f : valid WARC file name\n"
        "\t-o : record offset\n"
        "\t[-e] : print HTTP response headers (default 'no')\n"
        "\t[-t] : temporary working directory (default './')\n"
        # The example used "-n", which is not a defined option.
        "./app/python/wgetbloc.py -f foo.warc.gz -o 7"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")
    parser.add_option("-e", "--headers", action="store_false", default=True,
                      dest="headers")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")
    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error(" Incorrect arguments")
    if not options.filename:
        parser.error(" You must give WARC file name")
    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if not w:
        print("WARC file not found ")
        # Stop here: every following step needs a usable file object.
        sys.exit(1)

    # go to the specified offset
    w.seek(options.offset)
    if not w.hasMoreRecords():
        print("End of file reached, or no record at this offset %s"
              % options.offset)
        w.destroy()
        sys.exit(0)

    r = w.nextRecord()
    if not r:
        print("Couldn't get the WARC record object")
        w.destroy()
        sys.exit(1)

    # choose your buffer size (ex. 64K = 64 * 1024) to read the payload
    # (with the HTTP headers or not, use the boolean flag) chunk by chunk
    b = WBloc(w, r, options.headers, 64 * 1024)
    try:
        while True:
            buff = b.getNext()
            if not buff:
                # no more data to read: end of the record reached
                break
            # the chunk size is returned by calling "b.getLastChunkSize()"
            sys.stdout.write(buff)
    finally:
        b.destroy()
        r.destroy()
        w.destroy()
def main():
    """Fetch a list of URLs and store every document in one WARC file.

    Command line options:
      -f  file containing one URL per line (stdin when omitted)
      -o  WARC output file name (mandatory)
      -s  temporary file name, stored in the global ``tmpfile``
      -t  temporary directory, stored in the global ``tmpdir``
      -q  quiet mode, sets the global ``debug`` to False

    Blank lines and lines starting with ``#`` are skipped.  Each remaining
    URL is passed to ``fetcher`` and the result is handed to ``addToWarc``.
    Exits with status 1 when ``-o`` is missing.
    """
    global debug
    global tmpdir
    global tmpfile

    usage = (
        " Download list of URLs and store them in WARC container \n\n"
        + sys.argv[0]
        + " -f <urls_list> -o <warc_file> [-s <tmpfile>] [-t <tmpdir>] [-q]"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="input", type="string",
                      help="file name containing a list of URLs")
    parser.add_option("-o", "--output", dest="out", type="string",
                      help="WARC output file name to store documents")
    parser.add_option("-s", "--tmpfile", dest="tmpfile", type="string",
                      help="temporary file (default \".tmpfile\")")
    parser.add_option("-t", "--tmpdir", dest="tmpdir", type="string",
                      help="temporary directory (default \".\")")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
                      help="quiet, no debug")
    (options, args) = parser.parse_args()

    if options.tmpfile:
        tmpfile = options.tmpfile
    if options.quiet:
        debug = False
    if options.tmpdir:
        tmpdir = options.tmpdir

    # Validate the mandatory option before touching any input, so that a
    # missing -o is reported immediately instead of after reading stdin.
    if options.out is None:
        logger("-o <warc_file> option is mandatory\n")
        sys.exit(1)

    if options.input:
        source = open(options.input)
        try:
            urls = source.readlines()
        finally:
            source.close()
    else:
        urls = sys.stdin.readlines()

    # open the WARC file for writing
    cmode = warc.WARC_FILE_COMPRESSED_GZIP
    maxsize = 600 * 1024 * 1024
    w = WFile(options.out, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)

    try:
        for url in urls:
            url = url.strip()
            if not url or url[0] == "#":
                continue

            # fetch the URL
            effective_url, mime, date, ip, status, redirect = fetcher(url)
            addToWarc(w, effective_url, mime, date, ip)
            logger("%-30s %-3d %-3d %-30s\n"
                   % (url, status, redirect, effective_url))
    finally:
        try:
            # NOTE(review): tmpfile is presumably written by fetcher(); it
            # does not exist when no URL was fetched, so only remove it
            # when present.
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
        finally:
            w.destroy()
def main():
    """Dump the header fields of every record of a WARC file to stdout.

    Command line options:
      -f  WARC file name (mandatory)
      -t  temporary working directory (default ".")
      -v  also dump the ANVL fields of every record

    A header line is printed first.  For each record the fixed columns are
    written, followed by the optional fields that are set, or
    "--No One --" when none is.
    """
    usage = (
        "./app/python/warcdump.py -f <file.warc> [-v] [-t <working_dir>]\n "
        "\t-f : valid WARC file name\n"
        "\t[-t] : temporary working directory (default './')\n"
        "\t[-v] : dump ANVL (default false)"
    )
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")
    (options, args) = parser.parse_args()

    if args:
        parser.error("Incorrect arguments")
    if not options.filename:
        parser.error("You must provide a WARC file name")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if w == None:
        print("Couldn't create a WARC File object")
        return

    titles = ("Offset", "CSize", "WarcId", "Content-Length", "WARC-Type",
              "WARC-Date", "WARC-Record-ID")
    print("%-20s %-20s %-10s %-20s %-45s %-44s %-86s " % titles)

    # (format, record getter) for the fixed columns, in output order.
    columns = (
        ("%-20u ", "getOffset"),
        ("%-20u ", "getCompressedSize"),
        ("%-10s ", "getWarcId"),
        ("%-20u ", "getContentLength"),
        ("%-45u ", "getRecordType"),
        ("%-44s ", "getDate"),
        ("%-86s ", "getRecordId"),
    )
    # (format, label, record getter) for fields printed only when set.
    optional_fields = (
        ("%-35s: %-20s", "Content-Type", "getContentType"),
        ("%-35s: %-20s", "WARC-Concurrent-To", "getConcurrentTo"),
        ("%-35s: %-20s", "WARC-Block-Digest", "getBlockDigest"),
        ("%-35s: %-20s", "WARC-Payload-Digest", "getPayloadDigest"),
        ("%-35s: %-20s", "WARC-IP-Address", "getIpAddress"),
        ("%-35s: %-20s", "WARC-Refers-To", "getRefersTo"),
        ("%-35s: %-20s", "WARC-Target-URI", "getTargetUri"),
        ("%-35s: %-20s", "WARC-Truncated", "getTruncated"),
        ("%-35s: %-20s", "WARC-Warcinfo-ID", "getWarcInfoId"),
        ("%-35s: %-20s", "WARC-Filename:", "getFileName"),
        ("%-35s: %-20s", "WARC-Profile", "getProfile"),
        ("%-35s: %-20s", "WARC-Identified-Payload-type", "getPayloadType"),
        ("%-35s: %-20s", "WARC-Segment-Origin-ID", "getSegmentOriginId"),
        ("%-35s: %-20d", "WARC-Segment-Number", "getSegmentNumber"),
        ("%-35s: %-20d", "WARC-Segment-Total-Length", "getSegTotalLength"),
    )

    while w.hasMoreRecords():
        r = w.nextRecord()
        if r == None:
            w.destroy()
            print("Couldn't get the WARC record object")
            return

        for fmt, getter in columns:
            sys.stdout.write(fmt % getattr(r, getter)())

        # Becomes WARC_TRUE as soon as one optional field was printed.
        m1 = warc.WARC_FALSE
        for fmt, label, getter in optional_fields:
            if getattr(r, getter)():
                print(fmt % (label, getattr(r, getter)()))
                m1 = warc.WARC_TRUE
        if not m1:
            print("--No One --")

        if options.verbose:
            nb = r.getAnvlFieldsNumber()
            if nb != 0:
                print("-- More Info--\n")
                i = 0
                while i < nb:
                    print("key :  %s" % (r.getAnvlFieldKey(i),))
                    print("Value:  %s" % (r.getAnvlFieldValue(i),))
                    i += 1

        r.destroy()

    w.destroy()
    return
def convert(fname, outfname, tmpdir, cmode):
    """Copy every record of an ARC file into a new WARC file.

    fname    -- ARC file to read (compression is auto-detected)
    outfname -- WARC file to write
    tmpdir   -- temporary directory passed to ``AFile`` and ``WFile``
    cmode    -- true value for gzip output, false value for uncompressed

    Each ARC record becomes a ``WARC_RESPONSE_RECORD`` carrying the URL,
    creation date, MIME type and IP address of the ARC record.  Returns
    None; conversion stops at the first failure, which is reported on
    stdout.
    """
    a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir)
    if not a:
        print("ARC file not found ")
        return

    if cmode:
        compression = warc.WARC_FILE_COMPRESSED_GZIP
    else:
        compression = warc.WARC_FILE_UNCOMPRESSED

    w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER,
              compression, tmpdir)
    if w is None:
        print("given temporary directory does not exist ")
        a.destroy()
        return

    try:
        while a.hasMoreRecords():
            ar = a.nextRecord()
            if ar is None:
                print("bad ARC file")
                return

            wr = WRecord()
            if wr is None:
                print("can not create WARC record object")
                ar.destroy()
                return

            try:
                wr.setRecordType(warc.WARC_RESPONSE_RECORD)

                uri = ar.getUrl()
                wr.setTargetUri(uri, len(uri))

                date = ar.getCreationDate()
                wr.setDateFromArc(date, len(date))

                mime = ar.getMimeType()
                wr.setContentType(mime, len(mime))

                ip = ar.getIpAddress()
                wr.setIpAddress(ip, len(ip))

                # The record id is a hash of the URI and the current time.
                s = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
                rid = "uuid:" + sha.new(uri + s).hexdigest()
                wr.setRecordId(rid, len(rid))

                # Both calls are tested the way the original did: a true
                # return value means failure.
                if ar.transferContent(wr, a):
                    print("Unable to pass content to the WRecord")
                    return

                if w.storeRecord(wr):
                    print("failed to write WRecord")
                    return
            finally:
                # Release both records on every path; the failure paths
                # used to leak the WARC record.
                ar.destroy()
                wr.destroy()
    finally:
        a.destroy()
        w.destroy()