def leave_one_out(definition):
    """Leave-one-out evaluation of peak-memory prediction for one query.

    For every query tag in the trace file, the model is trained on all the
    other executions, the peak memory of the left-out execution is predicted,
    and the relative error is recorded. Per-iteration timings and errors are
    written to the result file and the error series is plotted as a PDF.
    """
    initial_time = datetime.datetime.now()
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    query_num = definition.query_num()

    dataset_dict = None
    # Note: the commented code below does not work because writeToFile and
    # loadFromFile are broken. When they are fixed this should speed up the
    # whole procedure a bit, because we will not need to parse a big trace
    # file.
    # if os.path.exists(definition['model_file']) and os.path.isfile(definition['model_file']):
    #     try:
    #         dataset_dict = MalDictionary.loadFromFile(definition['model_file'])
    #     except Exception:
    #         logging.warning('Could not load model file: {}. Rebuilding.'.format(definition['model_file']))
    #         dataset_dict = None

    if dataset_dict is None:
        print('Loading traces for query: {:02}...'.format(query_num), end='')
        sys.stdout.flush()
        load_start = datetime.datetime.now()
        dataset_dict = MalDictionary.fromJsonFile(definition.data_file(), blacklist, col_stats)
        load_end = datetime.datetime.now()
        print('Done: {}'.format(load_end - load_start))
        # dataset_dict.writeToFile(definition['model_file'])

    errors = list()
    pl = open(definition.result_file(), 'w')
    cnt = 0
    total = len(dataset_dict.query_tags)
    for leaveout_tag in dataset_dict.query_tags:
        iter_start = datetime.datetime.now()
        print("\b\b\b\b", end='')
        print('{:03}%'.format(int(100 * cnt / total)), end='')
        sys.stdout.flush()
        cnt += 1

        # Train on every execution except the left-out one.
        test_dict = dataset_dict.filter(lambda x: x.tag == leaveout_tag)
        train_dict = dataset_dict.filter(lambda x: x.tag != leaveout_tag)
        graph = test_dict.buildApproxGraph(train_dict)

        predict_start = datetime.datetime.now()
        predicted_mem = test_dict.predictMaxMem(graph)
        actual_mem = test_dict.getMaxMem()
        iter_end = datetime.datetime.now()

        errors.append(100 * (predicted_mem - actual_mem) / actual_mem)
        pl.write("{} {} {}\n".format(iter_end - iter_start,
                                     iter_end - predict_start,
                                     errors[cnt - 1]))
    print("")

    outfile = definition.out_path('Q{:02}_memerror.pdf'.format(query_num))
    pl.close()
    # One x value per left-out query (errors holds cnt entries).
    Utils.plotLine(numpy.arange(1, cnt + 1), errors, outfile, 'Error percent', 'Leave out query')
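
# Usage sketch (assumption, not part of the original module): leave_one_out()
# only relies on the accessors of `definition` that are called above, i.e.
# blacklist_path(), stats_path(), query_num(), data_file(), result_file() and
# out_path(). The driver below is hypothetical; it only shows how the
# experiment could be run for a whole set of query definitions.
def run_leave_one_out_suite(definitions):
    """Run the leave-one-out experiment for every definition in an iterable."""
    for definition in definitions:
        leave_one_out(definition)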
def plot_select_error_air(db, q, trainq=None, path="", ntrain=1000, step=25, output=None):
    """Plot how the select-size prediction error shrinks with more training queries.

    For a single test query, the prediction error of each select/thetaselect
    instruction (predicted memory footprint vs. actual result size) is
    averaged, while the number of random training queries grows from 1 to
    ntrain in increments of `step`.
    """
    assert db in ('tpch10', 'airtraffic')
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))
    if trainq is None:
        trainq = q

    logging.info("Examining Query: {}".format(q))
    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db, q=trainq, n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    # Keep only the select instructions of the test query.
    seld = testd.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
    seli = seld.getInsList()

    train_tags = traind.query_tags
    train_tags.sort()

    e = []
    ind = []
    # kutsurak: This loop increases the number of queries we use to train the
    # model.
    for i in range(1, ntrain + 2, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        error = 0
        for ins in seli:
            p = ins.predict(d12, pG)[0]
            cnt = ins.ret_size
            pc = p.getMem()
            # We use abs so that the errors do not cancel out.
            if cnt > 0:
                error += 100 * abs((pc - cnt) / cnt)
        e.append(error / len(seli))
        ind.append(i)

    print("error array:", e)
    outpdf = path + '{}_sel{}_error.pdf'.format(db, q) if output is None else output
    Utils.plotLine(ind, e, outpdf, 'Error percent', 'Number of training queries')
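
# Usage sketch (assumption, not part of the original module): the query
# identifier ('02') and the output path are hypothetical and only need to
# match the trace layout the function expects, i.e. traces/<db>/<q>.json for
# the test run and traces/random_<db>/ran_q<q>_n<ntrain>_<db>.json for the
# training runs.
def example_plot_select_error():
    """Plot the select-size error for one TPC-H sf10 query, training on up to
    1000 random query variants in steps of 25."""
    plot_select_error_air('tpch10', '02', ntrain=1000, step=25,
                          output='plots/tpch10_sel02_error.pdf')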
def plot_mem_error_air(db, q, trainq=None, path="", output=None, ntrain=1000, step=25):
    """Plot how the peak-memory prediction error shrinks with more training queries.

    The peak memory of a single test query is predicted while the number of
    random training queries grows from 1 to ntrain in increments of `step`;
    the relative error of each prediction is plotted.
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))
    if trainq is None:
        trainq = q

    logging.info("Examining Query: {}".format(q))
    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db, q=trainq, n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    train_tags = traind.query_tags
    train_tags.sort()

    e = []
    ind = []
    for i in range(1, ntrain + 2, step):
        # Train on the first i random queries and predict the test query.
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        pmm = testd.predictMaxMem(pG)
        mm = testd.getMaxMem()
        # print(pmm / 1000000, mm / 1000000)
        e.append(100 * ((pmm - mm) / mm))
        ind.append(i)

    print(e)
    outf = path + '{}_q{}_memerror.pdf'.format(db, q) if output is None else output
    Utils.plotLine(ind, e, outf, 'Error percent', 'Number of training queries')
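
# Usage sketch (assumption, not part of the original module): same trace
# layout as above; the query identifier and the plots/ directory are
# hypothetical. The default output location is used here, so the PDF lands at
# path + '<db>_q<q>_memerror.pdf'.
def example_plot_mem_error():
    """Plot the peak-memory error for one airtraffic query, training on up to
    1000 random query variants in steps of 25."""
    plot_mem_error_air('airtraffic', '03', path='plots/', ntrain=1000, step=25)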