def run(self):
    """Run the import, retrieval and edit-post benchmarks for every dataset.

    For each entry in ``self.DATASETS``:

    1. Build an XML reader for the dataset and log its statistics to the
       module-level ``LOGCSV`` writer.
    2. Reset every backend in ``self.DBOS`` and import the dataset into it.
    3. Draw a warm-up sample and a measured sample of ``RUNS`` threads,
       log the thread lengths, and run the retrieval benchmark on every
       backend (one silent warm-up pass, then the measured pass).
    4. Draw a sample of 1000 posts and run the edit-post benchmark on
       every backend.

    After all datasets have been processed, every backend is closed.
    """
    for DATASET in self.DATASETS:
        print("==== Importing Dataset %s === " % DATASET["name"])
        XRO = XMLReader.from_dataset(DATASET)

        stats = XRO.get_stats()
        PRINT(stats)
        LOGCSV.writerow(["Dataset Stats Keys", "", XRO.name] + list(stats.keys()))
        LOGCSV.writerow(["Dataset Stats Values", "", XRO.name] + list(stats.values()))

        for DBO in self.DBOS:
            print("%s %s" % ("Current Db: ", DBO.info))
            print("* reset tables")
            DBO.reset()
            print("* import dataset")
            DBO.import_XML(XRO)

        print("==== Retrieval Benchmark ====")
        print("* Generating samples")
        warmup_sample = XRO.get_thread_sample(RUNS)
        thread_sample = XRO.get_thread_sample(RUNS)
        self.LOG_THREADS(thread_sample)

        # Compute the lengths once; they feed both the CSV row and the average.
        thread_lengths = [XRO.get_thread_length(thread) for thread in thread_sample]
        LOGCSV.writerow(["Thread Lengths", "", XRO.name] + thread_lengths)
        print("%s %s" % ("* Average Thread Length:", sum(thread_lengths) / float(RUNS)))

        for DBO in self.DBOS:
            print("%s %s" % ("* Current Db: ", DBO.info))

            # Warmup run
            print("* warming up caches")
            self.retrieval_benchmark(DBO, warmup_sample, silent = True)

            # Benchmark
            results = self.retrieval_benchmark(DBO, thread_sample)
            LOGCSV.writerow(["Retrieval Ticks", DBO.info, DBO.XRO.name] + results)

        print("=== Edit Post Benchmark ===")
        print("* Generating samples")
        post_sample = XRO.get_post_sample(1000)

        for DBO in self.DBOS:
            self.edit_post_benchmark(DBO, post_sample)
            #print "* Benchmark Add Post"
            #self.add_post_benchmark(DBO)

    # Close the same backends that were benchmarked above (self.DBOS, as in
    # every other loop of this method) rather than a bare module-level name.
    for DBO in self.DBOS:
        DBO.close()
def __init__(self, host = 'localhost', port = 27017):
    """Open a MongoDB connection and bind the database/collection handles.

    host, port -- address handed to ``pymongo.MongoClient``.

    Sets ``con`` (client), ``db`` (database ``DB_NAME``), ``tc`` (collection
    ``COL_THREAD``), ``uc`` (collection ``COL_USER``) and ``XRO`` (an
    ``XMLReader`` built with no arguments).
    """
    client = pymongo.MongoClient(host, port)
    database = client[self.DB_NAME]

    self.con = client
    self.db = database
    self.tc = database[self.COL_THREAD]
    self.uc = database[self.COL_USER]

    self.XRO = XMLReader()
class MongoControls:
    """MongoDB backend that keeps threads and users in two collections.

    Threads are stored in collection ``COL_THREAD`` and users in collection
    ``COL_USER`` of database ``DB_NAME``. Thread documents use the thread's
    ``ID`` as ``_id``; user documents use ``ID + USEROFFSET`` as ``_id``
    (``USEROFFSET`` is a module-level constant defined outside this class).
    """
    # We follow the tutorial
    # http://api.mongodb.org/python/2.4.1/tutorial.html
    # Remark:
    # documents are members of collections
    # collections are members of databases

    # Display name of this backend.
    info = "Mongo"
    DB_NAME = "discuss"
    COL_THREAD = "threads"
    COL_USER = "******"

    def __init__(self, host = 'localhost', port = 27017):
        """Connect to MongoDB at ``host``/``port`` and bind collection handles."""
        self.con = pymongo.MongoClient(host, port)
        self.db = self.con[self.DB_NAME]
        self.tc = self.db[self.COL_THREAD]
        self.uc = self.db[self.COL_USER]
        self.XRO = XMLReader()

    def close(self):
        """Do nothing; present so callers can call ``close()`` on this backend."""
        pass

    def reset(self):
        """Drop both the thread and the user collection."""
        self.tc.drop()
        self.uc.drop()

    def insert_thread(self, data):
        """Insert ``data`` (one document or a list) into the thread collection.

        Returns the value of ``collection.insert``, or ``None`` if the
        insert raised ``DuplicateKeyError``.
        """
        try:
            return self.tc.insert(data)
        except pymongo.errors.DuplicateKeyError:
            return None

    def insert_user(self, data):
        """Insert ``data`` (one document or a list) into the user collection.

        Returns the value of ``collection.insert``, or ``None`` if the
        insert raised ``DuplicateKeyError``.
        """
        try:
            return self.uc.insert(data)
        except pymongo.errors.DuplicateKeyError:
            return None

    def get_thread(self, ID):
        """Fetch a thread and the user record of each of its posts.

        Returns ``{"thread": <document or None>, "users": [[user_id, user], ...]}``.
        The user list is empty when the thread does not exist or has no
        ``"posts"`` key, and is cut short at the first post that lacks a
        ``"userID"`` key.
        """
        thread_rec = self.tc.find_one({"_id": ID})
        user_list = []
        # find_one yields None for an unknown id; subscripting it would raise.
        if thread_rec is not None:
            try:
                for post in thread_rec["posts"]:
                    user_id = post["userID"] + USEROFFSET
                    user_list.append([user_id, self.uc.find_one({"_id": user_id})])
            except KeyError:
                pass
        return {"thread": thread_rec, "users": user_list}

    def get_user(self, ID):
        """Return the user document whose ``_id`` equals ``ID`` (``None`` if absent)."""
        return self.uc.find_one({"_id": ID})

    def fill_users(self, batch_size = 5000):
        """Insert every user from ``self.XRO`` in batches of ``batch_size``.

        Each user dict gets ``_id = ID + USEROFFSET`` before insertion.
        """
        batch = []
        for user in self.XRO.get_users():
            user["_id"] = user["ID"] + USEROFFSET
            batch.append(user)
            if len(batch) == batch_size:
                if DEBUG:
                    print("%s %s" % ("Writing user collection", batch_size))
                self.insert_user(batch)
                batch = []
        # Flush the remainder; skip it when empty because an empty bulk
        # insert is rejected by pymongo.
        if batch:
            self.insert_user(batch)

    def fill_threads(self, batch_size = 5000):
        """Insert every complete thread from ``self.XRO`` in batches of ``batch_size``.

        Each thread dict gets ``_id = ID`` before insertion.
        """
        batch = []
        for thread in self.XRO.get_complete_threads():
            thread["_id"] = thread["ID"]
            batch.append(thread)
            if len(batch) == batch_size:
                if DEBUG:
                    print("%s %s" % ("Writing thread collection", batch_size))
                self.insert_thread(batch)
                batch = []
        # Flush the remainder; skip it when empty because an empty bulk
        # insert is rejected by pymongo.
        if batch:
            self.insert_thread(batch)

    def create_indices(self):
        """Create an index on the ``"ID"`` field of both collections.

        Note: the lookups in this class query ``_id``, not ``ID``.
        """
        self.uc.create_index("ID")
        self.tc.create_index("ID")

    @TimeDec
    def import_XML(self, XRO):
        """Adopt ``XRO`` as the data source and load its threads, then its users."""
        self.XRO = XRO
        self.fill_threads()
        self.fill_users()

    def TEST(self):
        """Smoke test: reset, repopulate, print one thread and one user, index.

        Sets the module-level ``DEBUG`` flag to 1 as a side effect.
        """
        global DEBUG
        DEBUG = 1
        # print "Creating test documents"
        # tid = self.insert_thread({"thread":"test", "ID": -1})
        # uid = self.insert_user({"user":"******", "ID": -1})
        #
        # print self.get_thread(-1)
        # print self.get_user(-2)
        self.reset()
        print("Populating MongoDB")
        self.fill_users(10000)
        self.fill_threads()
        print(self.get_thread(3))
        print(self.get_user(3))
        print("Creating Indices")
        self.create_indices()