import os
import sys
import traceback
from threading import Lock, Thread

import yaml

import bench_base


# Measures query latency for a single query with Spark caching enabled and
# disabled, checkpointing results to YAML after every run so the experiment
# can resume where it left off.
def runCachingExperiment(name, data_dir):
    print("Running caching experiment for '" + name + "'.")
    memoryPerWorker = "21g"
    cores = 143
    timesToRun = 4
    tps = 1500000

    outFilePath = data_dir + "/caching/" + name + ".yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for cache in [True, False]:
        if cache not in data:
            data[cache] = []
        if len(data[cache]) >= timesToRun:
            print(" Already profiled for cache = " + str(cache) + ", skipping.")
            continue
        while len(data[cache]) < timesToRun:
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)
                # Load the data into cache with one untimed query.
                if cache:
                    bench_base.runQuery(name, "2014-01-01", "2014-01-07", cache, tps)
                while len(data[cache]) < timesToRun:
                    result = bench_base.runQuery(name, "2014-01-01", "2014-01-07",
                                                 cache, tps)
                    data[cache].append(result[2]['TimeMillis'] -
                                       result[0]['TimeMillis'])
                    # Checkpoint after every run.
                    with open(outFilePath, "w") as f:
                        f.write(yaml.dump(data, indent=2, default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
                # Uncached timings from a partially failed run may have been
                # contaminated by caching, so discard and remeasure them all.
                if not cache:
                    data[cache] = []
    return data
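# These experiments all assume the same bench_base contract: runQuery(name,
# startDate, endDate, cache, tps) and getDataLoadTime() return a list of
# timestamped event dicts, and durations are differences between 'TimeMillis'
# fields. The class below is a hypothetical stub of that contract, useful only
# for dry-running the experiment loops; the real bench_base module drives the
# Spark/HDFS cluster.
import time


class _FakeBenchBase(object):
    def restartServers(self):
        # Would restart the backing servers; a no-op in this stub.
        pass

    def restartSparkContext(self, memoryPerWorker, cores):
        # Would bring up a Spark context with the given resources.
        pass

    def getDataLoadTime(self):
        # Two events: load start and load end.
        now = int(time.time() * 1000)
        return [{'TimeMillis': now}, {'TimeMillis': now + 1}]

    def runQuery(self, name, startDate, endDate, cache, tps=None):
        # Three events; the experiments subtract event 0 from event 2.
        now = int(time.time() * 1000)
        return [{'TimeMillis': now}, {'TimeMillis': now}, {'TimeMillis': now + 1}]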
# Measures data-load and query-execution times as the cluster is scaled from
# six nodes down to three. The operator is prompted to resize the cluster
# between configurations.
def runScalingExperiment(queries, data_dir):
    print("Running scaling experiment.")
    memoryPerWorker = "21g"
    timesToRun = 4
    tps = 1500000
    allNodes = [6, 5, 4, 3]
    cache = True

    outFilePath = data_dir + "/scaling/scaling.yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for nodes in allNodes:
        # 24 cores per node, minus one (presumably reserved for the driver).
        cores = nodes * 24 - 1
        if nodes not in data:
            data[nodes] = {'dataLoad': [], 'execution': {}}
        if len(data[nodes]['dataLoad']) < timesToRun or \
           len(data[nodes]['execution']) < len(queries):
            input("Please ensure {} Spark and HDFS nodes are running "
                  "and press Enter to continue.".format(nodes))
        while len(data[nodes]['dataLoad']) < timesToRun or \
              len(data[nodes]['execution']) < len(queries):
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)
                while len(data[nodes]['dataLoad']) < timesToRun:
                    result = bench_base.getDataLoadTime()
                    data[nodes]['dataLoad'].append(
                        result[1]['TimeMillis'] - result[0]['TimeMillis'])
                    with open(outFilePath, "w") as f:
                        f.write(yaml.dump(data, indent=2, default_flow_style=False))
                for query in queries:
                    exeData = data[nodes]['execution']
                    if query not in exeData:
                        exeData[query] = []
                    if len(exeData[query]) >= timesToRun:
                        print(" Already profiled '" + query + "' for nodes = " +
                              str(nodes) + ", skipping.")
                        continue
                    while len(exeData[query]) < timesToRun:
                        # Load the data into cache with one untimed query.
                        if cache:
                            bench_base.runQuery(query, "2014-01-01", "2014-01-07",
                                                cache, tps)
                        while len(exeData[query]) < timesToRun:
                            result = bench_base.runQuery(
                                query, "2014-01-01", "2014-01-07", cache, tps)
                            exeData[query].append(
                                result[2]['TimeMillis'] - result[0]['TimeMillis'])
                            with open(outFilePath, "w") as f:
                                f.write(yaml.dump(data, indent=2,
                                                  default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
                # Caching is always on here, so no partial results need to be
                # discarded; the while loop simply retries.
    return data
# Sweeps the target partition size for a single query and records execution
# times for each size.
def runPartitionExperiment(name, data_dir):
    print("Running partition experiment for '" + name + "'.")
    targetPartitionSizes = [
        10000, 50000, 100000, 200000, 300000, 400000, 500000, 750000,
        1000000, 1500000
    ]
    memoryPerWorker = "21g"
    cores = 143
    timesToRun = 4
    cache = True

    outFilePath = data_dir + "/partitions/" + name + ".yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for tps in targetPartitionSizes:
        if tps not in data:
            data[tps] = []
        if len(data[tps]) >= timesToRun:
            print(" Already profiled for target partition size " + str(tps) +
                  ", skipping.")
            continue
        while len(data[tps]) < timesToRun:
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)
                # Load the data into cache with one untimed query.
                if cache:
                    bench_base.runQuery(name, "2014-01-01", "2014-01-07", cache, tps)
                while len(data[tps]) < timesToRun:
                    result = bench_base.runQuery(name, "2014-01-01", "2014-01-07",
                                                 cache, tps)
                    data[tps].append(result[2]['TimeMillis'] -
                                     result[0]['TimeMillis'])
                    with open(outFilePath, "w") as f:
                        f.write(yaml.dump(data, indent=2, default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
                # Caching is always on here, so no partial results need to be
                # discarded; the while loop simply retries.
    return data
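# A hypothetical helper sketching how the partition-sweep output might be
# consumed: each YAML file maps a target partition size to a list of execution
# times in milliseconds, so a summary is just a mean per size. This is not
# part of the benchmark itself.
def summarizePartitionSweep(name, data_dir):
    with open(data_dir + "/partitions/" + name + ".yaml", "r") as f:
        data = yaml.safe_load(f)
    for tps in sorted(data):
        times = data[tps]
        if times:
            print("targetPartitionSize {}: mean {:.1f} ms over {} runs".format(
                tps, sum(times) / float(len(times)), len(times)))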
# Measures query latency under increasing concurrency: for each level from one
# to eight, that many threads issue the same query repeatedly and record their
# own execution times.
def runConcurrentExperiment(name, data_dir):
    print("Running concurrent experiment for '" + name + "'.")
    allConcurrentQueries = list(range(1, 9))
    cores = 143
    memoryPerWorker = "20g"
    timesToRun = 4  # runs per thread, matching the other experiments
    tps = 1500000

    # A concurrency level is finished once every thread has timesToRun samples.
    def isDataFinished(concurrentQueries, d):
        if not d or not isinstance(d, list) or len(d) < concurrentQueries:
            return False
        for thread in d:
            if len(thread) < timesToRun:
                return False
        return True

    def run(concurrentQueries):
        g_lock = Lock()

        def threadEntry(threadNum):
            def isFinished():
                with g_lock:
                    for n in results:
                        if len(n) < timesToRun:
                            return False
                    return True

            try:
                # Every thread keeps issuing queries until the slowest thread
                # has enough samples, so concurrency stays constant throughout.
                while not isFinished():
                    print(str(threadNum) + ": Calling query.")
                    result = bench_base.runQuery(name, "2014-01-01",
                                                 "2014-01-07", True, tps)
                    queryExecutionTime = (result[2]['TimeMillis'] -
                                          result[0]['TimeMillis'])
                    print(str(threadNum) + ": Query execution time: " +
                          str(queryExecutionTime))
                    with g_lock:
                        results[threadNum].append(queryExecutionTime)
            except Exception:
                print("Error occurred in thread.")
                traceback.print_exc()

        results = [[] for _ in range(concurrentQueries)]
        threads = [Thread(target=threadEntry, args=(i,))
                   for i in range(concurrentQueries)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return results

    outFilePath = data_dir + "/concurrent/" + name + ".yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for concurrentQueries in allConcurrentQueries:
        if concurrentQueries in data and \
           isDataFinished(concurrentQueries, data[concurrentQueries]):
            print(" Already profiled for " + str(concurrentQueries) +
                  " concurrent queries, skipping.")
            continue
        else:
            data[concurrentQueries] = {}
        while not isDataFinished(concurrentQueries, data[concurrentQueries]):
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)
                # Load the data into cache with one untimed query.
                bench_base.runQuery(name, "2014-01-01", "2014-01-07", True, tps)
                data[concurrentQueries] = run(concurrentQueries)
                with open(outFilePath, "w") as f:
                    f.write(yaml.dump(data, indent=2, default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
                # Per-thread results from a failed run are incomplete; reset
                # and rerun the whole concurrency level.
                data[concurrentQueries] = {}
    return data
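# A similar hypothetical helper for the concurrent experiment: each
# concurrency level maps to one list of millisecond timings per thread, so a
# simple aggregate is the mean latency across all threads.
def summarizeConcurrent(name, data_dir):
    with open(data_dir + "/concurrent/" + name + ".yaml", "r") as f:
        data = yaml.safe_load(f)
    for concurrency in sorted(data):
        times = [t for thread in data[concurrency] for t in thread]
        if times:
            print("{} concurrent queries: mean latency {:.1f} ms".format(
                concurrency, sum(times) / float(len(times))))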
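# A hypothetical driver sketching how the experiments might be chained; the
# query names and output root are assumptions, not values taken from the
# benchmark. Output subdirectories are created up front because the experiment
# functions open their result files directly.
if __name__ == "__main__":
    data_dir = "results"  # hypothetical output root
    queries = ["exampleQuery"]  # hypothetical query names
    for sub in ["caching", "partitions", "concurrent", "scaling"]:
        os.makedirs(os.path.join(data_dir, sub), exist_ok=True)
    for query in queries:
        runCachingExperiment(query, data_dir)
        runPartitionExperiment(query, data_dir)
        runConcurrentExperiment(query, data_dir)
    runScalingExperiment(queries, data_dir)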