def prep_catalog():
    """Download python mailing list, create new catalog and catalog messages,
    if not done already.

    Steps, each skipped when its output already exists:

    1. Create ``BENCHMARK_DATA_DIR``.
    2. Fetch the mailing list archive when ``get_mailbox_filenames()``
       reports no mailbox files.
    3. Create ``test.zodb`` inside ``BENCHMARK_DATA_DIR``, register the
       ``subject``, ``date``, ``sender_email``, ``topics`` and ``text``
       indexes, and index every message yielded by ``MessageIterator()``
       under sequential document ids starting at 1.

    Two phases are timed through ``profiler``: a bare pass over the messages
    (the baseline) and the indexing pass itself.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if not os.path.exists(zodb_file):
        # Create a catalog
        manager = ConnectionManager()
        factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
        c = factory(manager)

        # Create some indices
        c['subject'] = CatalogFieldIndex(get_subject)
        c['date'] = CatalogFieldIndex(get_date)
        c['sender_email'] = CatalogFieldIndex(get_sender_email)
        c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
        c['text'] = CatalogTextIndex(get_text)
        manager.commit()

        # Loop over messages to get base line
        profiler.start("Loop over messages without indexing")
        for _ in MessageIterator():
            pass
        profiler.stop("Loop over messages without indexing")

        profiler.start("Index messages")
        # ``count`` is both the docid of the current message and the running
        # total, so it stays 0 when there are no messages at all.
        count = 0
        for count, msg in enumerate(MessageIterator(), 1):
            c.index_doc(count, msg)
            # Commit in batches of 100 documents.  The modulo is essential:
            # a division here never (true division) or almost always
            # (integer division, first 99 docs) triggers the commit.
            if count % 100 == 0:
                manager.commit()
        # Flush whatever is left over from the last partial batch.
        manager.commit()
        manager.close()

        profiler.stop("Index messages")
        print("Indexed %d messages" % count)
def run():
    """Run the retrieval benchmark and print the profiler report.

    Builds the catalog if necessary (via ``prep_catalog``), opens it, then
    times a fixed series of searches: four date-range retrievals (unsorted,
    unsorted again, sorted by subject, reverse sorted by subject), one facet
    search on ``year:2000`` and one facet count over that search's results.
    """
    # Make sure the mailbox archive is downloaded and the catalog exists.
    prep_catalog()

    # Open the catalog.
    storage_path = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    manager = ConnectionManager()
    catalog = FileStorageCatalogFactory(storage_path, 'benchmark')(manager)

    # Date-range searches: (profiler label, extra keyword arguments).
    date_range = ('0', 'Z')
    date_searches = (
        ("unsorted retrieval", {}),
        ("repeat unsorted retrieval", {}),
        ("sorted retrieval", {'sort_index': 'subject'}),
        ("reverse sorted retrieval",
         {'sort_index': 'subject', 'reverse': True}),
    )
    for label, extra in date_searches:
        profiler.start(label)
        total, brains = catalog.search(date=date_range, **extra)
        print('%d results ' % total)
        # Exhaust the result iterable so the cost of producing every
        # result is included in the timing.
        for _ in brains:
            pass
        profiler.stop(label)

    # Facet search; the results are kept for the count step below.
    topic = 'year:2000'
    profiler.start('limit to topic=year:2000')
    total, brains = catalog.search(topics=[topic])
    print('%d results' % total)
    matched = list(brains)
    profiler.stop('limit to topic=year:2000')

    profiler.start('count limited to topic=year:2000')
    print(catalog['topics'].counts(matched, [topic]))
    profiler.stop('count limited to topic=year:2000')

    profiler.stop()
    profiler.print_stack()
def prep_catalog():
    """Download python mailing list, create new catalog and catalog messages,
    if not done already.

    The data directory, the mailbox archive and the ``test.zodb`` catalog
    are each created only when missing.  When the catalog is created, every
    message from ``MessageIterator()`` is indexed under sequential document
    ids starting at 1, and both a bare iteration pass (baseline) and the
    indexing pass are timed through ``profiler``.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if not os.path.exists(zodb_file):
        # Create a catalog
        manager = ConnectionManager()
        factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
        c = factory(manager)

        # Create some indices
        c['subject'] = CatalogFieldIndex(get_subject)
        c['date'] = CatalogFieldIndex(get_date)
        c['sender_email'] = CatalogFieldIndex(get_sender_email)
        c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
        c['text'] = CatalogTextIndex(get_text)
        manager.commit()

        # Loop over messages to get base line
        profiler.start("Loop over messages without indexing")
        for _ in MessageIterator():
            pass
        profiler.stop("Loop over messages without indexing")

        profiler.start("Index messages")
        # Number of messages indexed so far; doubles as the docid.
        count = 0
        for count, msg in enumerate(MessageIterator(), 1):
            c.index_doc(count, msg)
            # Periodic commit every 100 documents (modulo, not division).
            if count % 100 == 0:
                manager.commit()
        # Commit the final partial batch.
        manager.commit()
        manager.close()

        profiler.stop("Index messages")
        # Single parenthesised argument: same output whether ``print`` is
        # the statement or the function.
        print("Indexed %d messages" % count)
def do_benchmark(fname, nd, nk1, nk2, out=sys.stdout):
    """Compare the two intersection implementations on a synthetic catalog.

    A catalog with two field indexes (``one`` and ``two``) is built in a
    file storage at ``fname`` and filled with ``nd`` documents whose field
    values are ``str(docid % nk1)`` and ``str(docid % nk2)``.  Random
    equality queries on both fields are then run through ``Intersection1``
    and ``Intersection2``, timing each and asserting that both return the
    same document ids.

    Parameters:
        fname: path of the storage file.  Every file matching
            ``fname + "*"`` is deleted before the run and again afterwards.
        nd: number of documents to index.
        nk1: number of distinct keys in index ``one``.
        nk2: number of distinct keys in index ``two``.
        out: file-like object that receives the report.

    Returns:
        ``(cost1 / cost2, cumulative1 / cumulative2)``: the ratio of the
        costs returned by ``predictions`` followed by the ratio of the
        measured cumulative times.

    Raises:
        AssertionError: if the two implementations disagree on a query.
    """
    cumulative1 = 0.0
    cumulative2 = 0.0

    print("Index 1:", file=out)
    print("\t# docs: %d" % nd, file=out)
    print("\t# distinct keys: %d" % nk1, file=out)
    print("Index 2:", file=out)
    print("\t# docs: %d" % nd, file=out)
    print("\t# distinct keys: %d" % nk2, file=out)
    print("", file=out)

    cost1, cost2 = predictions(nd, nk1, nk2)
    print('Cost1: %0.2f' % cost1, file=out)
    print('Cost2: %0.2f' % cost2, file=out)
    print("Prediction:", file=out)
    if cost1 > cost2:
        print("Algorithm 2 %0.2f times faster than Algorithm 1"
              % (cost1 / cost2), file=out)
    else:
        print("Algorithm 1 %0.2f times faster than Algorithm 2"
              % (cost2 / cost1), file=out)

    print("", file=out)
    print("Setting up indexes...", file=out)
    # Start from a clean slate: drop the storage file and its companions.
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(fname, 'intersection')
    catalog = factory(manager)

    catalog['one'] = CatalogFieldIndex('one')
    catalog['two'] = CatalogFieldIndex('two')

    class Document(object):
        """Synthetic document whose field values cycle with the docid."""

        def __init__(self, docid):
            self.one = str(docid % nk1)
            self.two = str(docid % nk2)

    for docid in xrange(nd):
        catalog.index_doc(docid, Document(docid))
    manager.commit()
    manager.close()

    N_QUERIES = 1000
    print("Running %d queries for each algorithm..." % N_QUERIES, file=out)
    # Obtain the catalog again after the close above so the queries run
    # against what was committed.
    catalog = factory(manager)
    # Loop on N_QUERIES so the number announced above is the number run.
    for _ in xrange(N_QUERIES):
        key1 = random.randrange(nk1)
        key2 = random.randrange(nk2)
        query1 = Intersection1(Eq('one', str(key1)), Eq('two', str(key2)))
        query2 = Intersection2(Eq('one', str(key1)), Eq('two', str(key2)))

        start = time.time()
        result1 = query1.apply(catalog)
        cumulative1 += time.time() - start

        start = time.time()
        result2 = query2.apply(catalog)
        cumulative2 += time.time() - start

        # Sanity check outside the timed sections: both implementations
        # must produce the same set of document ids.
        s1 = sorted(result1)
        s2 = sorted(result2)
        assert s1 == s2, (s1, s2)

    manager.close()
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    print("", file=out)
    print("Result:", file=out)
    print("Time for algorithm1: %0.3f s" % cumulative1, file=out)
    print("Time for algorithm2: %0.3f s" % cumulative2, file=out)
    if cumulative1 > cumulative2:
        print("Algorithm 2 %0.2f times faster than Algorithm 1"
              % (cumulative1 / cumulative2), file=out)
    else:
        print("Algorithm 1 %0.2f times faster than Algorithm 2"
              % (cumulative2 / cumulative1), file=out)

    return cost1 / cost2, cumulative1 / cumulative2
def do_benchmark(fname, nd, nk1, nk2, out=sys.stdout):
    """Compare the two intersection implementations on a synthetic catalog.

    A catalog with two field indexes (``one`` and ``two``) is built in a
    file storage at ``fname`` and filled with ``nd`` documents whose field
    values are ``str(docid % nk1)`` and ``str(docid % nk2)``.  Random
    equality queries on both fields are then run through ``Intersection1``
    and ``Intersection2``, timing each and asserting that both return the
    same document ids.

    Parameters:
        fname: path of the storage file.  Every file matching
            ``fname + "*"`` is deleted before the run and again afterwards.
        nd: number of documents to index.
        nk1: number of distinct keys in index ``one``.
        nk2: number of distinct keys in index ``two``.
        out: object with a ``write`` method that receives the report.

    Returns:
        ``(cost1 / cost2, cumulative1 / cumulative2)``: the ratio of the
        costs returned by ``predictions`` followed by the ratio of the
        measured cumulative times.

    Raises:
        AssertionError: if the two implementations disagree on a query.
    """
    def emit(line=""):
        # One report line per call.  Writing directly to ``out`` works
        # whether ``print`` is a statement or a function in this module.
        out.write(line + "\n")

    cumulative1 = 0.0
    cumulative2 = 0.0

    emit("Index 1:")
    emit("\t# docs: %d" % nd)
    emit("\t# distinct keys: %d" % nk1)
    emit("Index 2:")
    emit("\t# docs: %d" % nd)
    emit("\t# distinct keys: %d" % nk2)
    emit()

    cost1, cost2 = predictions(nd, nk1, nk2)
    emit('Cost1: %0.2f' % cost1)
    emit('Cost2: %0.2f' % cost2)
    emit()
    emit("Prediction:")
    if cost1 > cost2:
        emit("Algorithm 2 %0.2f times faster than Algorithm 1"
             % (cost1 / cost2))
    else:
        emit("Algorithm 1 %0.2f times faster than Algorithm 2"
             % (cost2 / cost1))

    emit()
    emit("Setting up indexes...")
    # Start from a clean slate: drop the storage file and its companions.
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(fname, 'intersection')
    catalog = factory(manager)

    catalog['one'] = CatalogFieldIndex('one')
    catalog['two'] = CatalogFieldIndex('two')

    class Document(object):
        """Synthetic document whose field values cycle with the docid."""

        def __init__(self, docid):
            self.one = str(docid % nk1)
            self.two = str(docid % nk2)

    for docid in xrange(nd):
        catalog.index_doc(docid, Document(docid))
    manager.commit()
    manager.close()

    N_QUERIES = 1000
    emit("Running %d queries for each algorithm..." % N_QUERIES)
    # Obtain the catalog again after the close above so the queries run
    # against what was committed.
    catalog = factory(manager)
    # Loop on N_QUERIES so the number announced above is the number run.
    for _ in xrange(N_QUERIES):
        key1 = random.randrange(nk1)
        key2 = random.randrange(nk2)
        query1 = Intersection1(Eq('one', str(key1)), Eq('two', str(key2)))
        query2 = Intersection2(Eq('one', str(key1)), Eq('two', str(key2)))

        start = time.time()
        result1 = query1.apply(catalog)
        cumulative1 += time.time() - start

        start = time.time()
        result2 = query2.apply(catalog)
        cumulative2 += time.time() - start

        # Sanity check outside the timed sections: both implementations
        # must produce the same set of document ids.
        s1 = sorted(result1)
        s2 = sorted(result2)
        assert s1 == s2, (s1, s2)

    manager.close()
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    emit()
    emit("Result:")
    emit("Time for algorithm1: %0.3f s" % cumulative1)
    emit("Time for algorithm2: %0.3f s" % cumulative2)
    if cumulative1 > cumulative2:
        emit("Algorithm 2 %0.2f times faster than Algorithm 1"
             % (cumulative1 / cumulative2))
    else:
        emit("Algorithm 1 %0.2f times faster than Algorithm 2"
             % (cumulative2 / cumulative1))

    return cost1 / cost2, cumulative1 / cumulative2