Example #1
0
def createDatabaseFromScratch():
    """Rebuild the global ``dbDir`` index from the archives in ``datadirs``.

    Every existing directory listed in ``datadirs`` is scanned for regular
    files whose path ends in ``.arc.bz2``.  For each such archive the
    per-archive directory is requested from ``arcDirGetCacheForDb``; when
    that yields a falsy result the directory is rebuilt by opening the
    archive with ``ARCive(fn, 'r')`` and calling ``getDir()``, and the result
    is stored again through ``saveCacheForDb``.

    After the call ``dbDir`` maps each URL to a list of
    ``(checksum, date, offset, archive_path)`` tuples, one per entry found
    for that URL across all archives.  Progress and timing lines are printed
    to stdout.  Returns ``None``.
    """
    global dbDir
    dbDir = {}
    for d in datadirs:
        if not os.path.exists(d):
            continue
        for basefn in os.listdir(d):
            fn = os.path.join(d, basefn)
            if not (os.path.isfile(fn) and fn.endswith(".arc.bz2")):
                continue
            # Single-argument print(...) behaves the same as the Python 2
            # print statement and is also valid Python 3.
            print(fn)
            start = time.time()
            arcDir = arcDirGetCacheForDb(d, basefn)
            if not arcDir:
                print("rebuilding cache for %r" % fn)
                a = ARCive(fn, 'r')
                try:
                    arcDir = a.getDir()
                    saveCacheForDb(arcDir, d, basefn)
                finally:
                    # Release the archive handle even if reading or caching
                    # fails.  NOTE(review): assumes getDir() returns a fully
                    # materialised mapping that stays usable after close()
                    # (the cache round-trip suggests so) -- confirm.
                    a.close()
            print("updating dbdir")
            for url in arcDir:
                dbDir.setdefault(url, []).extend(
                    (x["checksum"], x["date"], x["offset"], fn)
                    for x in arcDir[url]
                )
            delta = time.time() - start
            # A cached archive can be processed faster than the clock
            # resolution, leaving delta == 0; avoid a ZeroDivisionError.
            rate = len(arcDir) / delta if delta > 0 else float("inf")
            print("dbdirlen=%d, %d urls in %fs, %f url/s"
                  % (len(dbDir), len(arcDir), delta, rate))
Example #2
0
def getFile(pos, file, url):
    """Return the raw document stored at ``pos`` in the archive ``file``.

    Args:
        pos: Position passed unchanged to ``ARCive.readRawDocAtPos``.
        file: Path of the archive, passed to the ``ARCive`` constructor.
        url: Only used for the diagnostic line printed to stdout.

    Returns:
        Whatever ``readRawDocAtPos(pos)`` returns.

    The archive is closed before returning, including when the read raises.
    """
    # One pre-formatted argument keeps the output "url pos file" while being
    # valid under both the Python 2 print statement and Python 3.
    print("%s %s %s" % (url, pos, file))
    a = ARCive(file)
    try:
        return a.readRawDocAtPos(pos)
    finally:
        # Previously skipped when readRawDocAtPos raised, leaking the handle.
        a.close()