Esempio n. 1
0
def addToWarc(fname, wfile, uri, mime, date, ip, cmode, maxsize, tmpdir):
    """Store the contents of file ``fname`` as one resource record in a WARC file.

    Opens ``wfile`` for writing, builds a single ``WARC_RESOURCE_RECORD``
    whose header fields come from the arguments, stores it and releases the
    WFile / WRecord objects again (also when one of the calls raises).

    Parameters:
        fname   -- path of the file whose contents become the record payload
        wfile   -- name of the WARC file to write to
        uri     -- target URI of the record (also used to derive the record id)
        mime    -- content type of the record
        date    -- date string of the record
        ip      -- IP address string of the record
        cmode   -- compression mode handed to ``WFile``
        maxsize -- maximum WARC size handed to ``WFile``
        tmpdir  -- working directory handed to ``WFile``

    Returns None.  Failure to create the WFile or WRecord object is only
    reported on stdout.

    NOTE: the return values of the record setters and of ``storeRecord``
    are not checked here.
    """
    w = WFile(wfile, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        print("Couldn't create a WARC File object")
        return

    try:
        r = WRecord()
        if r is None:
            print("Couldn't create an empty WARC record object")
            return

        try:
            r.setRecordType(warc.WARC_RESOURCE_RECORD)
            r.setTargetUri(uri, len(uri))
            r.setDate(date, len(date))
            r.setContentType(mime, len(mime))

            # Placeholder identifier: the SHA-1 of the URI followed by a
            # *fixed* string (the strftime pattern itself, not a formatted
            # time), so the same URI always yields the same record id.
            # Plug in a real unique-identifier function here if needed.
            s = "%Y-%m-%dT%H:%M:%SZ"
            rid = "uuid:" + sha.new(uri + s).hexdigest()
            r.setRecordId(rid, len(rid))
            r.setIpAddress(ip, len(ip))

            r.setContentFromFileName(fname)

            w.storeRecord(r)
        finally:
            r.destroy()
    finally:
        w.destroy()
Esempio n. 2
0
def main():
    """Fetch every URL from a list and store the documents in a WARC file.

    Command line: ``-f <urls_list> -o <warc_file> [-s <tmpfile>]
    [-t <tmpdir>] [-q]``.  URLs are read from the ``-f`` file, or from
    stdin when ``-f`` is absent; blank lines and lines starting with ``#``
    are skipped.

    Updates the module globals ``debug``, ``tmpdir`` and ``tmpfile`` from
    the options.  Exits with status 1 when ``-o`` is missing or the WARC
    file object cannot be created.
    """
    global debug
    global tmpdir
    global tmpfile

    usage = (
        " Download list of URLs and store them in WARC container \n\n"
        + sys.argv[0]
        + " -f <urls_list> -o <warc_file> [-s <tmpfile>] [-t <tmpdir>] [-q]"
    )

    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="input", type="string", help="file name containing a list of URLs")
    parser.add_option("-o", "--output", dest="out", type="string", help="WARC output file name to store documents")
    parser.add_option("-s", "--tmpfile", dest="tmpfile", type="string", help='temporary file (default ".tmpfile")')
    parser.add_option("-t", "--tmpdir", dest="tmpdir", type="string", help='temporary directory (default ".")')
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true", help="quiet, no debug")

    (options, args) = parser.parse_args()

    # Check the mandatory option first, so a missing -o is reported before
    # we block reading the URL list from stdin.
    if options.out is None:
        logger("-o <warc_file> option is mandatory\n")
        sys.exit(1)

    if options.tmpfile:
        tmpfile = options.tmpfile

    if options.quiet:
        debug = False

    if options.tmpdir:
        tmpdir = options.tmpdir

    if options.input:
        url_file = open(options.input)
        try:
            urls = url_file.readlines()
        finally:
            url_file.close()
    else:
        urls = sys.stdin.readlines()

    # open the WARC file for writing
    cmode = warc.WARC_FILE_COMPRESSED_GZIP
    maxsize = 600 * 1024 * 1024
    w = WFile(options.out, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        logger("Couldn't create a WARC File object\n")
        sys.exit(1)

    try:
        for url in urls:
            url = url.strip()
            if not url or url[0] == "#":
                continue

            # fetch the URL
            # NOTE(review): fetcher() presumably saves the document to
            # `tmpfile`, which addToWarc() then reads -- confirm.
            effective_url, mime, date, ip, status, redirect = fetcher(url)

            addToWarc(w, effective_url, mime, date, ip)
            logger("%-30s %-3d  %-3d %-30s\n" % (url, status, redirect, effective_url))

            os.remove(tmpfile)
    finally:
        # release the WARC file even when a fetch or store step raises
        w.destroy()
Esempio n. 3
0
def indexWarc(warcFileName):
    """Post every Atom record of a WARC file to Solr.

    Each record's payload is read completely into memory.  Records whose
    content type is exactly 'application/atom+xml' are converted with
    OpdsToCatalog / CatalogToSolr and submitted through post.sh; all other
    records are read and ignored.

    A scratch directory is created for the run and removed again on every
    exit path, together with the WARC file / record / block objects.

    Returns None.  Stops early (after printing a message) when a record
    cannot be read.  Raises AssertionError when the WARC file cannot be
    opened or post.sh exits with a non-zero status.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = None
    try:
        w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        assert w

        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("bad record.. bailing!")
                return

            b = None
            try:
                url = r.getTargetUri()
                print('processing ' + url)
                b = WBloc(w, r, False, 64 * 1024)

                # Collect the chunks and join once instead of growing a
                # string with += for every 64K block.
                chunks = []
                while True:
                    buf = b.getNext()
                    if not buf:
                        break
                    chunks.append(buf)
                content = ''.join(chunks)

                if 'application/atom+xml' == r.getContentType():
                    ingestor = bookserver.catalog.ingest.OpdsToCatalog(content, url)
                    c = ingestor.getCatalog()
                    provider = getProvider(url)
                    renderer = bookserver.catalog.output.CatalogToSolr(c, provider)
                    _postToSolr(renderer.toString(), tempdir)
            finally:
                if b is not None:
                    b.destroy()
                r.destroy()
    finally:
        if w:
            w.destroy()
        # the directory is empty again at this point: _postToSolr always
        # removes its scratch file
        os.rmdir(tempdir)


def _postToSolr(solr_xml, tempdir):
    """Write `solr_xml` to a scratch file in `tempdir` and submit it with post.sh."""
    solr_import_xml = tempdir + "/solr_import.xml"
    f = open(solr_import_xml, 'w')
    try:
        try:
            f.write(solr_xml)
        finally:
            f.close()

        command = """/solr/example/exampledocs/post.sh '%s'""" % (
            solr_import_xml)

        (ret, out) = commands.getstatusoutput(command)
        # show the tool's output whenever it does not report status 0
        if -1 == out.find('<int name="status">0</int>'):
            print(out)
        assert 0 == ret
    finally:
        os.unlink(solr_import_xml)
Esempio n. 4
0
def indexWarc(warcFileName):
    """Post every Atom record of a WARC file to Solr.

    Each record's payload is read completely into memory.  Records whose
    content type is exactly 'application/atom+xml' are converted with
    OpdsToCatalog / CatalogToSolr and submitted through post.sh; all other
    records are read and ignored.

    A scratch directory is created for the run and removed again on every
    exit path, together with the WARC file / record / block objects.

    Returns None.  Stops early (after printing a message) when a record
    cannot be read.  Raises AssertionError when the WARC file cannot be
    opened or post.sh exits with a non-zero status.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = None
    try:
        w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        assert w

        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("bad record.. bailing!")
                return

            b = None
            try:
                url = r.getTargetUri()
                print('processing ' + url)
                b = WBloc(w, r, False, 64 * 1024)

                # Collect the chunks and join once instead of growing a
                # string with += for every 64K block.
                chunks = []
                while True:
                    buf = b.getNext()
                    if not buf:
                        break
                    chunks.append(buf)
                content = ''.join(chunks)

                if 'application/atom+xml' == r.getContentType():
                    ingestor = bookserver.catalog.ingest.OpdsToCatalog(content, url)
                    c = ingestor.getCatalog()
                    provider = getProvider(url)
                    renderer = bookserver.catalog.output.CatalogToSolr(c, provider)
                    _postToSolr(renderer.toString(), tempdir)
            finally:
                if b is not None:
                    b.destroy()
                r.destroy()
    finally:
        if w:
            w.destroy()
        # the directory is empty again at this point: _postToSolr always
        # removes its scratch file
        os.rmdir(tempdir)


def _postToSolr(solr_xml, tempdir):
    """Write `solr_xml` to a scratch file in `tempdir` and submit it with post.sh."""
    solr_import_xml = tempdir + "/solr_import.xml"
    f = open(solr_import_xml, 'w')
    try:
        try:
            f.write(solr_xml)
        finally:
            f.close()

        command = """/solr/example/exampledocs/post.sh '%s'""" % (solr_import_xml)

        (ret, out) = commands.getstatusoutput(command)
        # show the tool's output whenever it does not report status 0
        if -1 == out.find('<int name="status">0</int>'):
            print(out)
        assert 0 == ret
    finally:
        os.unlink(solr_import_xml)
Esempio n. 5
0
def indexWarc(indexin, fname, tempdir):
    """Append one index line per response/resource record of a WARC file.

    Parameters:
        indexin -- path of the index file; opened in append mode
        fname   -- path of the WARC file to read
        tempdir -- working directory handed to ``WFile``

    For every WARC_RESPONSE_RECORD / WARC_RESOURCE_RECORD whose key path is
    shorter than 400 characters a line
    ``<keypath> <timestamp> <content-type> 200 <offset> <redirect> <fname> warc``
    is written.  Returns None.

    The index file, the WARC file and every record are released on all
    exit paths, including the early return when the WARC cannot be opened.
    """
    f = open(indexin, 'a')
    try:
        w = WFile(fname, CONSTANT, warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        if not w:
            return None
        try:
            while w.hasMoreRecords():
                r = w.nextRecord()
                if r is None:
                    # unreadable record: stop indexing instead of crashing
                    # on the None below
                    break
                try:
                    rect = getRecTypeName(r.getRecordType())
                    if rect == 'WARC_RESPONSE_RECORD' or rect == 'WARC_RESOURCE_RECORD':
                        headers = getHeaders(w, r)

                        # a missing or unparsable status counts as 200
                        try:
                            if 'status' in headers:
                                status = int(headers['status'])
                            else:
                                status = 200
                        except (ValueError, TypeError):
                            status = 200

                        if 'content-type' in headers:
                            contenttype = headers['content-type'].split(';')[0]
                        else:
                            contenttype = 'text/plain'

                        # TODO: fix this -- redirect detection is switched
                        # off by the trailing `and False`, so `redirect` is
                        # always '-'.
                        if status > 300 and status < 400 and False:
                            redirect = ''
                        else:
                            redirect = '-'

                        kp = keypath(r.getTargetUri())
                        if len(kp) < 400:
                            # NOTE(review): the status column is the literal
                            # 200, not the parsed `status` -- confirm intent.
                            f.write('%s %s %s %u %u %s %s warc\n' %
                                    (kp, getTimestampFromWarcDate(r.getDate()),
                                     contenttype, 200, r.getOffset(),
                                     redirect, fname))
                finally:
                    r.destroy()
        finally:
            w.destroy()
    finally:
        f.close()
Esempio n. 6
0
def createNewWarc(domain, domain_warc_dir, tempdir, crawlDateTime):
    """Create a fresh gzip-compressed WARC file for one crawl of `domain`.

    A new WARC is created for every crawl rather than appending to an
    older one, so the file is named after the domain and `crawlDateTime`:
    ``<domain_warc_dir>/<domain>_<crawlDateTime.isoformat()>_warc.gz``.

    Returns the tuple (WFile writer, file name, date-time used in the name).
    """
    stamp = crawlDateTime
    path = '%s/%s_%s_warc.gz' % (domain_warc_dir, domain, stamp.isoformat())
    print('creating new warc file ' + path)

    writer = WFile(
        path,
        config['max_warc_size'],
        warc.WARC_FILE_WRITER,
        warc.WARC_FILE_COMPRESSED_GZIP,
        tempdir,
    )
    return writer, path, stamp
Esempio n. 7
0
def getLatestWarc(domain_warc_dir, tempdir):
    """Open the last (by sorted file name) ``*_warc.gz`` file of a directory.

    Parameters:
        domain_warc_dir -- directory searched for ``*_warc.gz`` files
        tempdir         -- working directory handed to ``WFile``

    Returns (WFile opened as gzip writer, file name, datetime parsed from
    the ``<name>_<iso-date>_warc.gz`` file name), or (None, None, None)
    when the directory holds no such file.

    Raises AssertionError when the file name does not have the expected
    ``<name>_<iso-date>_warc.gz`` shape.
    """
    import os  # local import: only needed for basename()

    warcs = sorted(glob.glob(domain_warc_dir + '/*_warc.gz'))
    if not warcs:
        print('No warc file found in ' + domain_warc_dir)
        return None, None, None

    warcFileName = warcs[-1]

    # Match on the file name only: the directory part may legitimately
    # contain whitespace, which \S+ would refuse to cross.
    m = re.match(r"(\S+)_(\S+)_warc\.gz$", os.path.basename(warcFileName))
    assert m is not None, 'unexpected warc file name: ' + warcFileName

    # the date in the file name carries no zone; it is parsed as UTC
    isodate = xml.utils.iso8601.parse(m.group(2) + '+00:00')
    warcDateTime = datetime.datetime.utcfromtimestamp(isodate)

    cmode = warc.WARC_FILE_COMPRESSED_GZIP

    w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_WRITER, cmode, tempdir)

    return w, warcFileName, warcDateTime
Esempio n. 8
0
def getRecord(warcname, offset, tempdir='.'):
    """Read the record at `offset` of a WARC file and split off its HTTP header.

    Parameters:
        warcname -- path of the WARC file
        offset   -- record offset; converted with int()
        tempdir  -- working directory handed to ``WFile`` (default '.')

    Returns (headers, content): `headers` is a dict built from the lines
    before the first blank line of the payload, `content` is everything
    after it.  An empty payload yields ({}, '').

    The WFile, record and block objects are destroyed on every exit path.
    """
    w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    rec = None
    b = None
    try:
        w.seek(int(offset))
        rec = w.nextRecord()
        b = WBloc(w, rec, False, BLOCKSIZE)

        # gather the chunks and join once instead of repeated +=
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
    finally:
        if b is not None:
            b.destroy()
        if rec is not None:
            rec.destroy()
        w.destroy()

    return _splitHttpMessage(''.join(chunks))


def _splitHttpMessage(dat):
    """Split raw payload `dat` into a (headers dict, body string) pair."""
    headers = {}
    if len(dat) == 0:
        return (headers, '')

    # header block and body are separated by the first blank line
    parts = dat.split('\r\n\r\n', 1)
    if len(parts) > 1:
        content = parts[1]
    else:
        content = ''

    for line in parts[0].split('\r\n'):
        line = line.strip()
        if len(line) == 0:
            break
        if ':' in line:
            # Split on the first colon only, so values that contain colons
            # themselves (dates, URLs) are kept intact.  The value keeps
            # whatever whitespace follows the colon.
            name, value = line.split(':', 1)
            headers[name] = value
        elif line.startswith('HTTP'):
            # status line: fill in as many of the three keys as there are
            # space-separated tokens
            tokens = line.split(' ')
            for key, token in zip(('protocol', 'status', 'code'), tokens):
                headers[key] = token
    return (headers, content)
Esempio n. 9
0
def getRecord(warcname, offset, tempdir='.'):
    """Read the record at `offset` of a WARC file and split off its HTTP header.

    Parameters:
        warcname -- path of the WARC file
        offset   -- record offset; converted with int()
        tempdir  -- working directory handed to ``WFile`` (default '.')

    Returns (headers, content): `headers` is a dict built from the lines
    before the first blank line of the payload, `content` is everything
    after it.  An empty payload yields ({}, '').

    The WFile, record and block objects are destroyed on every exit path.
    """
    w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    rec = None
    b = None
    try:
        w.seek(int(offset))
        rec = w.nextRecord()
        b = WBloc(w, rec, False, BLOCKSIZE)

        # gather the chunks and join once instead of repeated +=
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
    finally:
        if b is not None:
            b.destroy()
        if rec is not None:
            rec.destroy()
        w.destroy()

    return _splitHttpMessage(''.join(chunks))


def _splitHttpMessage(dat):
    """Split raw payload `dat` into a (headers dict, body string) pair."""
    headers = {}
    if len(dat) == 0:
        return (headers, '')

    # header block and body are separated by the first blank line
    parts = dat.split('\r\n\r\n', 1)
    if len(parts) > 1:
        content = parts[1]
    else:
        content = ''

    for line in parts[0].split('\r\n'):
        line = line.strip()
        if len(line) == 0:
            break
        if ':' in line:
            # Split on the first colon only, so values that contain colons
            # themselves (dates, URLs) are kept intact.  The value keeps
            # whatever whitespace follows the colon.
            name, value = line.split(':', 1)
            headers[name] = value
        elif line.startswith('HTTP'):
            # status line: fill in as many of the three keys as there are
            # space-separated tokens
            tokens = line.split(' ')
            for key, token in zip(('protocol', 'status', 'code'), tokens):
                headers[key] = token
    return (headers, content)
Esempio n. 10
0
def indexWarc(indexin, fname, tempdir):
    """Append one index line per response/resource record of a WARC file.

    Parameters:
        indexin -- path of the index file; opened in append mode
        fname   -- path of the WARC file to read
        tempdir -- working directory handed to ``WFile``

    For every WARC_RESPONSE_RECORD / WARC_RESOURCE_RECORD whose key path is
    shorter than 400 characters a line
    ``<keypath> <timestamp> <content-type> 200 <offset> <redirect> <fname> warc``
    is written.  Returns None.

    The index file, the WARC file and every record are released on all
    exit paths, including the early return when the WARC cannot be opened.
    """
    f = open(indexin, 'a')
    try:
        w = WFile(fname, CONSTANT, warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        if not w:
            return None
        try:
            while w.hasMoreRecords():
                r = w.nextRecord()
                if r is None:
                    # unreadable record: stop indexing instead of crashing
                    # on the None below
                    break
                try:
                    rect = getRecTypeName(r.getRecordType())
                    if rect == 'WARC_RESPONSE_RECORD' or rect == 'WARC_RESOURCE_RECORD':
                        headers = getHeaders(w, r)

                        # a missing or unparsable status counts as 200
                        try:
                            if 'status' in headers:
                                status = int(headers['status'])
                            else:
                                status = 200
                        except (ValueError, TypeError):
                            status = 200

                        if 'content-type' in headers:
                            contenttype = headers['content-type'].split(';')[0]
                        else:
                            contenttype = 'text/plain'

                        # TODO: fix this -- redirect detection is switched
                        # off by the trailing `and False`, so `redirect` is
                        # always '-'.
                        if status > 300 and status < 400 and False:
                            redirect = ''
                        else:
                            redirect = '-'

                        kp = keypath(r.getTargetUri())
                        if len(kp) < 400:
                            # NOTE(review): the status column is the literal
                            # 200, not the parsed `status` -- confirm intent.
                            f.write('%s %s %s %u %u %s %s warc\n' %
                                    (kp, getTimestampFromWarcDate(r.getDate()),
                                     contenttype, 200, r.getOffset(),
                                     redirect, fname))
                finally:
                    r.destroy()
        finally:
            w.destroy()
    finally:
        f.close()
Esempio n. 11
0
def main():
    """List the records of a WARC file that match one regular-expression filter.

    Exactly one of ``-u`` (target URI), ``-m`` (content type) or ``-r``
    (record type name) must be given.  For every matching record the fixed
    summary columns are written, followed by the optional named fields that
    are set and, with ``-v``, the ANVL fields.

    Option errors end the program through ``parser.error``.  The WARC file
    and every record are destroyed on all exit paths.
    """
    usage =  "./app/python/warcfilter -f <file.warc> [-u <uri>] [-m <mime>] [-r <rtype>] [-v] [-t <working_dir>]\n "\
             "\t-f    : valid WARC file name\n"\
             "\t[-u]  : regular expression comparison with URI\n"\
             "\t[-m]  : regular expression comparison with MIME\n"\
             "\t[-r]  : regular expression comparison with record types (see 'public/wrectype.h' for possible values)\n"\
             "\t[-t]  : temporary working directory (default './')\n"\
             "\t[-v]  : dump ANVL (default false)"

    parser = OptionParser(usage)

    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-u", "--uri", dest="uri",
                      help="filter applied to uri field")
    parser.add_option("-m", "--mime", dest="mime",
                      help="filter applied to mime field")
    parser.add_option("-r", "--rtype", dest="rtype",
                      help="filter applied to record type field")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default="./")

    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error(" Incorrect arguments")

    if not options.filename:
        parser.error(" You must give WARC file name")

    # numeric record type -> name, used for the -r filter
    rectype = {0 : 'WARC_UNKNOWN_RECORD', 1 : 'WARC_INFO_RECORD', 2 : 'WARC_RESPONSE_RECORD', 3 :  'WARC_REQUEST_RECORD',  4 :  'WARC_METADATA_RECORD', \
               5 : 'WARC_REVISIT_RECORD', 6 :  'WARC_CONVERSION_RECORD', 7 : 'WARC_CONTINUATION_RECORD', 8 : 'WARC_RESOURCE_RECORD'}

    # exactly one of the three filters has to be present
    cpt = len([flt for flt in (options.uri, options.mime, options.rtype) if flt])
    if cpt != 1:
        parser.error(
            " You must apply filter to one of fields : uri or mime or record type"
        )

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if not w:
        print("WARC file  not found  ")
        return

    try:
        while w.hasMoreRecords():

            r = w.nextRecord()
            if not r:
                print("bad WARC file ")
                return

            try:
                # pick the pattern and the record field it applies to
                if options.uri:
                    pattern, subject = options.uri, r.getTargetUri()
                elif options.mime:
                    pattern, subject = options.mime, r.getContentType()
                else:
                    pattern = options.rtype
                    # an unlisted type number is treated as unknown
                    # instead of raising KeyError
                    subject = rectype.get(warc.WRecord_getRecordType(r),
                                          rectype[0])

                # a record without the filtered field never matches
                if subject and searchFromAnyPosition(pattern, subject):
                    sys.stdout.write(
                        "%-20u %-20u %-10s %-20u %-45u %-44s %-86s " % (
                            r.getOffset(), r.getCompressedSize(),
                            r.getWarcId(), r.getContentLength(),
                            r.getRecordType(), r.getDate(),
                            r.getRecordId()))

                    print("More Fields:\n")
                    if not _printOptionalFields(r):
                        print("--No One --")

                    if options.verbose:
                        _printAnvlFields(r)
            finally:
                r.destroy()
    finally:
        w.destroy()
    return


def _printOptionalFields(r):
    """Print every optional named field set on record `r`; return True if any was printed."""
    text = "%-35s: %-20s"
    number = "%-35s: %-20d"
    fields = (
        ("Content-Type", r.getContentType, text),
        ("WARC-Concurrent-To", r.getConcurrentTo, text),
        ("WARC-Block-Digest", r.getBlockDigest, text),
        ("WARC-Payload-Digest", r.getPayloadDigest, text),
        ("WARC-IP-Address", r.getIpAddress, text),
        ("WARC-Refers-To", r.getRefersTo, text),
        ("WARC-Target-URI", r.getTargetUri, text),
        ("WARC-Truncated", r.getTruncated, text),
        ("WARC-Warcinfo-ID", r.getWarcInfoId, text),
        ("WARC-Filename:", r.getFileName, text),
        ("WARC-Profile", r.getProfile, text),
        ("WARC-Identified-Payload-type", r.getPayloadType, text),
        ("WARC-Segment-Origin-ID", r.getSegmentOriginId, text),
        ("WARC-Segment-Number", r.getSegmentNumber, number),
        ("WARC-Segment-Total-Length", r.getSegTotalLength, number),
    )
    printed = False
    for label, getter, fmt in fields:
        value = getter()
        if value:
            print(fmt % (label, value))
            printed = True
    return printed


def _printAnvlFields(r):
    """Print the key and value of every ANVL field of record `r`."""
    nb = r.getAnvlFieldsNumber()
    if nb != 0:
        print("-- More Info--\n")
        for i in range(nb):
            print("key :   %s" % (r.getAnvlFieldKey(i),))
            print("Value:  %s" % (r.getAnvlFieldValue(i),))
Esempio n. 12
0
def urlInWarc(url, warcFileName, tempdir):
    """Open `warcFileName` as a WARC reader with compression auto-detection.

    NOTE(review): only the WFile construction is visible here -- `url` is
    not used and nothing is returned, so the remainder of this function
    appears to be missing.  Confirm against the original source.
    """
    reader = WFile(
        warcFileName,
        config['max_warc_size'],
        warc.WARC_FILE_READER,
        warc.WARC_FILE_DETECT_COMPRESSION,
        tempdir,
    )
Esempio n. 13
0
def main():
    """Write the payload of the WARC record at a given offset to stdout.

    Command line: ``-f <file.warc> -o <offset> [-e] [-t <working_dir>]``.
    The payload is read through WBloc in 64K chunks and copied to stdout.

    Option errors end the program through ``parser.error``.  When there is
    no record at the offset a message is printed and the program exits with
    status 0.  The WARC file is destroyed on every exit path.
    """
    usage =  "./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-v] [-t <working_dir>]\n "\
             "\t-f    : valid WARC file name\n"\
             "\t-o    : record offset\n"\
             "\t[-e]  : print HTTP response headers (default 'no')\n"\
             "\t[-t]  : temporary working directory (default './')\n"\
             "./app/python/wgetbloc.py -f foo.warc.gz -o 7"

    parser = OptionParser(usage)

    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")
    # -e switches the flag handed to WBloc from True (default) to False
    parser.add_option("-e", "--headers", action="store_false",
                      default=True, dest="headers")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")

    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error(" Incorrect arguments")

    if not options.filename:
        parser.error(" You must give WARC file name")

    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if not w:
        print("WARC file  not found ")
        # nothing to read from: stop instead of failing on w.seek() below
        return

    try:
        # go to the specified offset
        w.seek(options.offset)
        r = None
        if w.hasMoreRecords():
            r = w.nextRecord()
        if r is None:
            print("End of file reached, or no record at this offset %s"
                  % (options.offset,))
            sys.exit(0)

        b = None
        try:
            # choose your buffer size (ex. 64K = 64 * 1024) to read the
            # payload (with the HTTP headers or not, use the boolean flag)
            # chunk by chunk
            b = WBloc(w, r, options.headers, 64 * 1024)
            while True:
                buff = b.getNext()
                if not buff:
                    # no more data to read: end of record reached
                    break
                # the chunk size is returned by "b.getLastChunkSize()"
                sys.stdout.write(buff)
        finally:
            if b is not None:
                b.destroy()
            r.destroy()
    finally:
        w.destroy()
Esempio n. 14
0
def main():
    """Dump the header fields of every record of a WARC file to stdout.

    Command line: ``-f <file.warc> [-v] [-t <working_dir>]``.  After a
    column header line, each record gets its fixed summary columns followed
    by the optional named fields that are set and, with ``-v``, its ANVL
    fields.

    Option errors end the program through ``parser.error``.  The WARC file
    and every record are destroyed on all exit paths.
    """
    usage =  "./app/python/warcdump.py -f <file.warc> [-v] [-t <working_dir>]\n "\
             "\t-f    : valid WARC file name\n"\
             "\t[-t]  : temporary working directory (default './')\n"\
             "\t[-v]  : dump ANVL (default false)"

    parser = OptionParser(usage)

    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")

    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error("Incorrect arguments")

    if not options.filename:
        parser.error("You must provide a WARC file name")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if w is None:
        print("Couldn't create a WARC File object")
        return

    try:
        print("%-20s %-20s %-10s %-20s %-45s %-44s %-86s " % (
            "Offset", "CSize", "WarcId", "Content-Length", "WARC-Type",
            "WARC-Date", "WARC-Record-ID"))

        while w.hasMoreRecords():

            r = w.nextRecord()
            if r is None:
                print("Couldn't get the WARC record object")
                return

            try:
                # same column widths as the header line printed above
                sys.stdout.write(
                    "%-20u %-20u %-10s %-20u %-45u %-44s %-86s " % (
                        r.getOffset(), r.getCompressedSize(),
                        r.getWarcId(), r.getContentLength(),
                        r.getRecordType(), r.getDate(), r.getRecordId()))

                if not _printOptionalFields(r):
                    print("--No One --")

                if options.verbose:
                    _printAnvlFields(r)
            finally:
                r.destroy()
    finally:
        w.destroy()
    return


def _printOptionalFields(r):
    """Print every optional named field set on record `r`; return True if any was printed."""
    text = "%-35s: %-20s"
    number = "%-35s: %-20d"
    fields = (
        ("Content-Type", r.getContentType, text),
        ("WARC-Concurrent-To", r.getConcurrentTo, text),
        ("WARC-Block-Digest", r.getBlockDigest, text),
        ("WARC-Payload-Digest", r.getPayloadDigest, text),
        ("WARC-IP-Address", r.getIpAddress, text),
        ("WARC-Refers-To", r.getRefersTo, text),
        ("WARC-Target-URI", r.getTargetUri, text),
        ("WARC-Truncated", r.getTruncated, text),
        ("WARC-Warcinfo-ID", r.getWarcInfoId, text),
        ("WARC-Filename:", r.getFileName, text),
        ("WARC-Profile", r.getProfile, text),
        ("WARC-Identified-Payload-type", r.getPayloadType, text),
        ("WARC-Segment-Origin-ID", r.getSegmentOriginId, text),
        ("WARC-Segment-Number", r.getSegmentNumber, number),
        ("WARC-Segment-Total-Length", r.getSegTotalLength, number),
    )
    printed = False
    for label, getter, fmt in fields:
        value = getter()
        if value:
            print(fmt % (label, value))
            printed = True
    return printed


def _printAnvlFields(r):
    """Print the key and value of every ANVL field of record `r`."""
    nb = r.getAnvlFieldsNumber()
    if nb != 0:
        print("-- More Info--\n")
        for i in range(nb):
            print("key :   %s" % (r.getAnvlFieldKey(i),))
            print("Value:  %s" % (r.getAnvlFieldValue(i),))
Esempio n. 15
0
def main():
    """Write the payload of the WARC record at a given offset to stdout.

    Command line: ``-f <file.warc> -o <offset> [-e] [-t <working_dir>]``.
    The payload is read through WBloc in 64K chunks and copied to stdout.

    Option errors end the program through ``parser.error``.  When there is
    no record at the offset a message is printed and the program exits with
    status 0.  The WARC file is destroyed on every exit path.
    """
    usage =  "./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-v] [-t <working_dir>]\n "\
             "\t-f    : valid WARC file name\n"\
             "\t-o    : record offset\n"\
             "\t[-e]  : print HTTP response headers (default 'no')\n"\
             "\t[-t]  : temporary working directory (default './')\n"\
             "./app/python/wgetbloc.py -f foo.warc.gz -o 7"

    parser = OptionParser(usage)

    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")

    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")

    # -e switches the flag handed to WBloc from True (default) to False
    parser.add_option("-e", "--headers",
                      action="store_false", default=True, dest="headers")

    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")

    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error(" Incorrect arguments")

    if not options.filename:
        parser.error(" You must give WARC file name")

    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if not w:
        print("WARC file  not found ")
        # nothing to read from: stop instead of failing on w.seek() below
        return

    try:
        # go to the specified offset
        w.seek(options.offset)
        r = None
        if w.hasMoreRecords():
            r = w.nextRecord()
        if r is None:
            print("End of file reached, or no record at this offset %s"
                  % (options.offset,))
            sys.exit(0)

        b = None
        try:
            # choose your buffer size (ex. 64K = 64 * 1024) to read the
            # payload (with the HTTP headers or not, use the boolean flag)
            # chunk by chunk
            b = WBloc(w, r, options.headers, 64 * 1024)
            while True:
                buff = b.getNext()
                if not buff:
                    # no more data to read: end of record reached
                    break
                # the chunk size is returned by "b.getLastChunkSize()"
                sys.stdout.write(buff)
        finally:
            if b is not None:
                b.destroy()
            r.destroy()
    finally:
        w.destroy()
Esempio n. 16
0
def main():
    """Fetch every URL from a list and store the documents in a WARC file.

    Command line: ``-f <urls_list> -o <warc_file> [-s <tmpfile>]
    [-t <tmpdir>] [-q]``.  URLs are read from the ``-f`` file, or from
    stdin when ``-f`` is absent; blank lines and lines starting with ``#``
    are skipped.

    Updates the module globals ``debug``, ``tmpdir`` and ``tmpfile`` from
    the options.  Exits with status 1 when ``-o`` is missing or the WARC
    file object cannot be created.
    """
    global debug
    global tmpdir
    global tmpfile

    usage = " Download list of URLs and store them in WARC container \n\n"\
            + sys.argv[0] + " -f <urls_list> -o <warc_file> [-s <tmpfile>] [-t <tmpdir>] [-q]"

    parser = OptionParser(usage)
    parser.add_option("-f",
                      "--file",
                      dest="input",
                      type="string",
                      help="file name containing a list of URLs")
    parser.add_option("-o",
                      "--output",
                      dest="out",
                      type="string",
                      help="WARC output file name to store documents")
    parser.add_option("-s",
                      "--tmpfile",
                      dest="tmpfile",
                      type="string",
                      help="temporary file (default \".tmpfile\")")
    parser.add_option("-t",
                      "--tmpdir",
                      dest="tmpdir",
                      type="string",
                      help="temporary directory (default \".\")")
    parser.add_option("-q",
                      "--quiet",
                      dest="quiet",
                      action="store_true",
                      help="quiet, no debug")

    (options, args) = parser.parse_args()

    # Check the mandatory option first, so a missing -o is reported before
    # we block reading the URL list from stdin.
    if options.out is None:
        logger("-o <warc_file> option is mandatory\n")
        sys.exit(1)

    if options.tmpfile:
        tmpfile = options.tmpfile

    if options.quiet:
        debug = False

    if options.tmpdir:
        tmpdir = options.tmpdir

    if options.input:
        url_file = open(options.input)
        try:
            urls = url_file.readlines()
        finally:
            url_file.close()
    else:
        urls = sys.stdin.readlines()

    # open the WARC file for writing
    cmode = warc.WARC_FILE_COMPRESSED_GZIP
    maxsize = 600 * 1024 * 1024
    w = WFile(options.out, maxsize, warc.WARC_FILE_WRITER, cmode, tmpdir)
    if w is None:
        logger("Couldn't create a WARC File object\n")
        sys.exit(1)

    try:
        for url in urls:
            url = url.strip()
            if not url or url[0] == "#":
                continue

            # fetch the URL
            # NOTE(review): fetcher() presumably saves the document to
            # `tmpfile`, which addToWarc() then reads -- confirm.
            effective_url, mime, date, ip, status, redirect = fetcher(url)

            addToWarc(w, effective_url, mime, date, ip)
            logger("%-30s %-3d  %-3d %-30s\n" %
                   (url, status, redirect, effective_url))

            os.remove(tmpfile)
    finally:
        # release the WARC file even when a fetch or store step raises
        w.destroy()
Esempio n. 17
0
def main():
    """Dump the header fields of every record of a WARC file to stdout.

    Command-line options:
        -f / --file     WARC file to read (mandatory).
        -v / --verbose  also print the ANVL key/value pairs of each record.
        -t / --tempdir  temporary working directory (default ".").

    A column header is printed first.  For every record a row of
    fixed-width columns is written (without a trailing newline), followed
    by one "<name>: <value>" line for each optional named field whose
    getter returns a truthy value, or "--No One --" when none of them does.

    Returns None.  Invalid command lines are reported through
    ``parser.error``.
    """
    usage = ("./app/python/warcdump.py -f <file.warc> [-v] [-t <working_dir>]\n "
             "\t-f    : valid WARC file name\n"
             "\t[-t]  : temporary working directory (default './')\n"
             "\t[-v]  : dump ANVL (default false)")

    parser = OptionParser(usage)
    parser.add_option("-f", "--file",
                      dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose")
    parser.add_option("-t", "--tempdir",
                      dest="tmpdir",
                      help="Temporary working directory",
                      default=".")

    (options, args) = parser.parse_args()

    if args:
        parser.error("Incorrect arguments")

    if not options.filename:
        parser.error("You must provide a WARC file name")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if w is None:
        print("Couldn't create a WARC File object")
        return

    text_fmt = "%-35s: %-20s"
    int_fmt = "%-35s: %-20d"

    # (printed label, WRecord getter name, line format), in output order.
    named_fields = (
        ("Content-Type", "getContentType", text_fmt),
        ("WARC-Concurrent-To", "getConcurrentTo", text_fmt),
        ("WARC-Block-Digest", "getBlockDigest", text_fmt),
        ("WARC-Payload-Digest", "getPayloadDigest", text_fmt),
        ("WARC-IP-Address", "getIpAddress", text_fmt),
        ("WARC-Refers-To", "getRefersTo", text_fmt),
        ("WARC-Target-URI", "getTargetUri", text_fmt),
        ("WARC-Truncated", "getTruncated", text_fmt),
        ("WARC-Warcinfo-ID", "getWarcInfoId", text_fmt),
        # The label used to end with ':' which, combined with the format
        # string, printed a doubled colon for this field only.
        ("WARC-Filename", "getFileName", text_fmt),
        ("WARC-Profile", "getProfile", text_fmt),
        ("WARC-Identified-Payload-type", "getPayloadType", text_fmt),
        ("WARC-Segment-Origin-ID", "getSegmentOriginId", text_fmt),
        ("WARC-Segment-Number", "getSegmentNumber", int_fmt),
        ("WARC-Segment-Total-Length", "getSegTotalLength", int_fmt),
    )

    # try/finally guarantees the file object (and the record currently
    # being dumped) is released on every exit path, including exceptions.
    try:
        print("%-20s %-20s %-10s %-20s %-45s %-44s %-86s " % (
            "Offset", "CSize", "WarcId", "Content-Length", "WARC-Type",
            "WARC-Date", "WARC-Record-ID"))

        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("Couldn't get the WARC record object")
                return

            try:
                # Fixed-width columns matching the header printed above.
                sys.stdout.write(
                    "%-20u %-20u %-10s %-20u %-45u %-44s %-86s " % (
                        r.getOffset(),
                        r.getCompressedSize(),
                        r.getWarcId(),
                        r.getContentLength(),
                        r.getRecordType(),
                        r.getDate(),
                        r.getRecordId()))

                printed_any = False
                for label, getter, fmt in named_fields:
                    # Call each getter once and reuse the value for both
                    # the presence test and the output.
                    value = getattr(r, getter)()
                    if value:
                        print(fmt % (label, value))
                        printed_any = True

                if not printed_any:
                    print("--No One --")

                if options.verbose:
                    nb = r.getAnvlFieldsNumber()
                    if nb != 0:
                        print("-- More Info--\n")
                        for i in range(nb):
                            print("key :   %s" % (r.getAnvlFieldKey(i),))
                            print("Value:  %s" % (r.getAnvlFieldValue(i),))
            finally:
                r.destroy()
    finally:
        w.destroy()
Esempio n. 18
0
def convert(fname, outfname, tmpdir, cmode):
    """Convert the ARC file *fname* into the WARC file *outfname*.

    Every ARC record is copied into a new WARC response record carrying the
    ARC record's URL, creation date, MIME type and IP address, plus a
    generated record id.

    Parameters:
        fname    -- path of the ARC file to read.
        outfname -- path of the WARC file to write.
        tmpdir   -- temporary working directory handed to both file objects.
        cmode    -- truthy to write a gzip-compressed WARC file, falsy to
                    write an uncompressed one.

    Returns None.  Failures are reported by printing a message and stopping
    the conversion; every file and record object created so far is
    destroyed before returning, also when an exception propagates.
    """
    a = AFile(fname, arc.ARC_FILE_DETECT_COMPRESSION, tmpdir)

    if not a:
        print("ARC file not found ")
        return

    if cmode:
        cmode = warc.WARC_FILE_COMPRESSED_GZIP
    else:
        cmode = warc.WARC_FILE_UNCOMPRESSED

    w = WFile(outfname, 16 * 1024 * 1024 * 1024, warc.WARC_FILE_WRITER, cmode,
              tmpdir)

    if w is None:
        print("given temporary directory does not exist ")
        a.destroy()
        return

    # Outer finally: both file objects are released on every exit path.
    try:
        while a.hasMoreRecords():
            ar = a.nextRecord()
            if ar is None:
                print("bad ARC file")
                return

            wr = None
            # Inner finally: the per-record objects are released before the
            # files (the same order as the success path), including on the
            # failure paths where the WARC record used to be left alive.
            try:
                wr = WRecord()
                if wr is None:
                    print("can not create WARC record object")
                    return

                wr.setRecordType(warc.WARC_RESPONSE_RECORD)

                uri = ar.getUrl()
                wr.setTargetUri(uri, len(uri))

                date = ar.getCreationDate()
                wr.setDateFromArc(date, len(date))

                mime = ar.getMimeType()
                wr.setContentType(mime, len(mime))

                ip = ar.getIpAddress()
                wr.setIpAddress(ip, len(ip))

                # Record id: "uuid:" + hex digest of the URL concatenated
                # with the current local time (one-second resolution).
                stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
                rid = "uuid:" + sha.new(uri + stamp).hexdigest()
                wr.setRecordId(rid, len(rid))

                # A truthy return value is treated as failure by both calls.
                if ar.transferContent(wr, a):
                    print("Unable to pass content to the WRecord")
                    return

                if w.storeRecord(wr):
                    print("failed to write WRecord")
                    return
            finally:
                ar.destroy()
                if wr is not None:
                    wr.destroy()
    finally:
        a.destroy()
        w.destroy()