def createDatabaseFromScratch(): global dbDir dbDir = {} for d in datadirs: if not os.path.exists(d): continue for basefn in os.listdir(d): fn = os.path.join(d, basefn) if os.path.isfile(fn) and fn.endswith(".arc.bz2"): print fn start = time.time() arcDir = arcDirGetCacheForDb(d, basefn) if not arcDir: print "rebuilding cache for %r" % fn a = ARCive(fn, 'r') arcDir = a.getDir() saveCacheForDb(arcDir, d, basefn) print "updating dbdir" for url in arcDir: if url not in dbDir: dbDir[url] = [] for x in arcDir[url]: dbDir[url].append((x["checksum"], x["date"], x["offset"], fn)) delta = time.time() - start print "dbdirlen=%d, %d urls in %fs, %f url/s" % (len(dbDir), len(arcDir), delta, len(arcDir)/delta)
def getFile(pos, file, url): print url, pos, file a = ARCive(file) ret = a.readRawDocAtPos(pos) a.close() return ret