def plot_max_mem_error(q):
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')
    for qno in q:
        logging.info("Testing query {}".format(qno))
        # use a separate name for the zero-padded string so we do not
        # shadow the parameter q we are iterating over
        qstr = "{}".format(qno)
        if qno < 10:
            qstr = "0{}".format(qstr)
        logging.info("loading training set...")
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch_sf10/ran{}_200_sf10.json".format(qno),
            blacklist, col_stats)
        logging.info("loading test set...")
        d2 = MalDictionary.fromJsonFile("traces/tpch-sf10/{}.json".format(qstr),
                                        blacklist, col_stats)
        train_tags = d1.query_tags
        train_tags.sort()
        e = []
        ind = []
        for i in [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150, 175, 200]:
            d12 = d1.filter(lambda ins: ins.tag in train_tags[0:i])
            print(len(d12.query_tags))
            pG = d2.buildApproxGraph(d12)
            pmm = d2.predictMaxMem(pG) / 1_000_000_000
            mm = d2.getMaxMem() / 1_000_000_000
            e.append(100 * abs((pmm - mm) / mm))
            ind.append(i)
        print(e)
        Utils.plotBar(ind, e, "results/memf_error_q{}.pdf".format(qno),
                      'nof training queries', 'error perc')
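
# A minimal usage sketch (the query numbers are hypothetical; any iterable
# of TPC-H query numbers works, since the function zero-pads those below
# 10 itself and writes one PDF per query under results/):
#
#   plot_max_mem_error([1, 6, 19])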
def plot_allmem_tpch10(path=""):
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')
    e = []
    for qno in range(1, 23):
        q = "0{}".format(qno) if qno < 10 else "{}".format(qno)
        logging.info("Examining Query: {}".format(q))
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch10/ran_q{}_n200_tpch10.json".format(q),
            blacklist, col_stats)
        d2 = MalDictionary.fromJsonFile("traces/tpch10/{}.json".format(q),
                                        blacklist, col_stats)
        pG = d2.buildApproxGraph(d1)
        pmm = d2.predictMaxMem(pG) / 1_000_000_000
        mm = d2.getMaxMem() / 1_000_000_000
        err = 100 * abs((pmm - mm) / mm)
        print("query: {}, pred mem: {}, actual mem: {}, error {}".format(
            qno, pmm, mm, err))
        e.append(err)
    # TODO: use os.path.join for the following
    outf = path + "mem_error_1-23.pdf"
    # axis labels ordered as (x, y), matching the other plotBar call sites
    Utils.plotBar(range(1, 23), e, outf, 'query no', 'error perc')
def predict(definition):
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    # column stats come from ColumnStatsD, not the blacklist parser
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    model = load_model(definition)
    plan = definition.demo_plan_file()
    plan_dict = MalDictionary.fromJsonFile(plan, blacklist, col_stats)
    # note: the prediction step itself is not implemented here yet;
    # model and plan_dict are only loaded
def leave_one_out(definition):
    initial_time = datetime.datetime.now()
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    query_num = definition.query_num()
    dataset_dict = None
    # Note: the commented code below does not work because writeToFile
    # and loadFromFile are broken. When they are fixed this should speed
    # up the whole procedure a bit, because we will not need to parse a
    # big trace file.
    # if os.path.exists(definition['model_file']) and os.path.isfile(definition['model_file']):
    #     try:
    #         dataset_dict = MalDictionary.loadFromFile(definition['model_file'])
    #     except:
    #         logging.warning('Could not load model file: {}. Rebuilding.'.format(definition['model_file']))
    #         dataset_dict = None
    if dataset_dict is None:
        print('Loading traces for query: {:02}...'.format(query_num), end='')
        sys.stdout.flush()
        load_start = datetime.datetime.now()
        dataset_dict = MalDictionary.fromJsonFile(definition.data_file(),
                                                  blacklist, col_stats)
        load_end = datetime.datetime.now()
        print('Done: {}'.format(load_end - load_start))
        # dataset_dict.writeToFile(definition['model_file'])
    errors = list()
    pl = open(definition.result_file(), 'w')
    cnt = 0
    total = len(dataset_dict.query_tags)
    for leaveout_tag in dataset_dict.query_tags:
        iter_start = datetime.datetime.now()
        print("\b\b\b\b", end='')
        print('{:03}%'.format(int(100 * cnt / total)), end='')
        sys.stdout.flush()
        cnt += 1
        # hold out one query; train on all the others
        test_dict = dataset_dict.filter(lambda x: x.tag == leaveout_tag)
        train_dict = dataset_dict.filter(lambda x: x.tag != leaveout_tag)
        graph = test_dict.buildApproxGraph(train_dict)
        predict_start = datetime.datetime.now()
        predicted_mem = test_dict.predictMaxMem(graph)
        actual_mem = test_dict.getMaxMem()
        iter_end = datetime.datetime.now()
        errors.append(100 * (predicted_mem - actual_mem) / actual_mem)
        pl.write("{} {} {}\n".format(iter_end - iter_start,
                                     iter_end - predict_start,
                                     errors[cnt - 1]))
    print()
    outfile = definition.out_path('Q{:02}_memerror.pdf'.format(query_num))
    pl.close()
    # errors has cnt entries, so the x axis must run 1..cnt inclusive
    Utils.plotLine(numpy.arange(1, cnt + 1), errors, outfile,
                   'Error percent', 'Leave out query')
def plot_select_error_air(db, q, trainq=None, path="", ntrain=1000, step=25,
                          output=None):
    assert db == 'tpch10' or db == 'airtraffic'
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))
    if trainq is None:
        trainq = q
    logging.info("Examining Query: {}".format(q))
    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=trainq,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)
    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)
    # keep only the select instructions
    seld = testd.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
    seli = seld.getInsList()
    train_tags = traind.query_tags
    train_tags.sort()
    e = []
    ind = []
    # kutsurak: This loop increases the number of queries we use to
    # train the model.
    for i in range(1, ntrain + 2, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        error = 0
        for ins in seli:
            p = ins.predict(d12, pG)[0]
            cnt = ins.ret_size
            pc = p.getMem()
            # we use abs so that the errors do not cancel out
            if cnt > 0:
                error += 100 * abs((pc - cnt) / cnt)
        e.append(error / len(seli))
        ind.append(i)
    print("error array:", e)
    outpdf = path + '{}_sel{}_error.pdf'.format(db, q) if output is None else output
    Utils.plotLine(ind, e, outpdf, 'Error perc', 'Nof training queries')
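
# The quantity accumulated in the loop above is the mean absolute
# percentage error (MAPE) of predicted vs. actual memory over the select
# instructions of the test query, computed once per training-set size i:
#
#   MAPE(i) = (100 / |seli|) * sum_ins |pred_mem(ins) - actual_mem(ins)| / actual_mem(ins)
#
# which is what gets plotted against the number of training queries.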
def analyze_max_mem():
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')
    for qno in range(19, 20):
        logging.info("loading training set...")
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch_sf10/ran_q{}_n200_tpch10.json".format(qno),
            blacklist, col_stats)
        logging.info("loading test set...")
        d2 = MalDictionary.fromJsonFile("traces/tpch-sf10/{}.json".format(qno),
                                        blacklist, col_stats)
        pG = d2.buildApproxGraph(d1)
        # sel2 = d2.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
        testi = d2.getInsList()
        testi.sort(key=lambda ins: ins.clk)
        for ins in testi:
            pmm = ins.approxMemSize(pG)
            mm = ins.ret_size
            # only report instructions with a non-trivial footprint
            if mm > 10000:
                err = 100 * abs((pmm - mm) / mm)
                print(ins.short)
                print("query: {}, pred mem: {}, actual mem: {}, error {}".format(
                    qno, pmm, mm, err))
                print("cnt: {} pred cnt: {}".format(
                    ins.cnt, ins.predict(d1, pG)[0].avg))
                print("")
def train_model(definition):
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    # column stats come from ColumnStatsD, not the blacklist parser
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    print('Loading traces for demo... ', end='')
    sys.stdout.flush()
    training_set = definition.demo_training_set()
    dataset_mal = MalDictionary.fromJsonFile(training_set, blacklist, col_stats)
    print('Done')
    print('Writing model to disk: {}... '.format(definition.demo_model_storage()),
          end='')
    dataset_mal.writeToFile(definition.demo_model_storage())
    print('Done')
    return dataset_mal
def plot_actual_memory(definition):
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    print('Loading traces...', end='')
    sys.stdout.flush()
    data_file = definition.data_file()
    load_start = datetime.datetime.now()
    dataset_dict = MalDictionary.fromJsonFile(data_file, blacklist, col_stats)
    load_end = datetime.datetime.now()
    print('Done: {}'.format(load_end - load_start))
    outfile = definition.result_file()
    ofl = open(outfile, 'w')
    print('Computing footprint... ', end='')
    sys.stdout.flush()
    cnt = 0
    total = len(dataset_dict.query_tags)
    for t in dataset_dict.query_tags:
        print("\b\b\b\b", end='')
        print('{:03}%'.format(int(100 * cnt / total)), end='')
        sys.stdout.flush()
        cnt += 1
        # get the total memory footprint for a specific query
        tq = dataset_dict.filter(lambda x: x.tag == t)
        total_mem = tq.getMaxMem()
        ofl.write("{},{}\n".format(t, total_mem))
    print("")
    ofl.close()
def plot_mem_error_air(db, q, trainq=None, path="", output=None, ntrain=1000,
                       step=25):
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))
    if trainq is None:
        trainq = q
    logging.info("Examining Query: {}".format(q))
    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=trainq,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)
    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)
    train_tags = traind.query_tags
    train_tags.sort()
    e = []
    ind = []
    for i in range(1, ntrain + 2, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        pmm = testd.predictMaxMem(pG)
        mm = testd.getMaxMem()
        # print(pmm / 1000000, mm / 1000000)
        # the error is kept signed here, so over- and under-estimation
        # remain distinguishable in the plot
        e.append(100 * ((pmm - mm) / mm))
        ind.append(i)
    print(e)
    outf = path + '{}_q{}_memerror.pdf'.format(db, q) if output is None else output
    Utils.plotLine(ind, e, outf, 'Error perc', 'Nof training queries')
def __init__(self, *def_args, jobj, stats):
    MalInstruction.__init__(self, *def_args)
    # this is the column type
    self.ctype = jobj["arg"][0].get("type", "UNKNOWN")
    alias_iter = iter([o["alias"] for o in jobj["arg"] if "alias" in o])
    # this is the initial column
    self.col = next(alias_iter, "TMP").split('.')[-1]
    # if we have a projection argument, this is the projection column
    self.proj_col = next(alias_iter, "TMP").split('.')[-1]
    self.arg_size = [o.get("size", 0) for o in jobj.get("arg", [])]
    self.op = Utils.extract_operator(self.fname, jobj)
    a1 = self.arg_list[1]
    self.lead_arg_i = 1 if a1.isVar() and a1.cnt > 0 else 0
    self.lead_arg = self.arg_list[self.lead_arg_i]
    lo, hi = Utils.hi_lo(self.fname, self.op, jobj,
                         stats.get(self.col, ColumnStats(0, 0, 0, 0, 0)))
    if lo == 'nil':
        lo = 0
    if hi == 'nil':
        hi = 0
    if self.ctype in ['bat[:int]', 'bat[:lng]', 'lng', 'bat[:hge]',
                      'bat[:bte]', 'bat[:sht]']:
        if self.op in ['>=', 'between'] and self.col in stats:
            s = stats[self.col]
            # widen the upper bound by roughly one distinct-value stride
            step = round((int(s.maxv) - int(s.minv)) / int(s.uniq))
            self.lo, self.hi = (int(lo), int(hi) + step)
        else:  # TODO <=
            self.lo, self.hi = (int(lo), int(hi))
    elif self.ctype == 'bat[:date]':
        self.lo = datetime.strptime(lo, '%Y-%m-%d')
        self.hi = datetime.strptime(hi, '%Y-%m-%d')
    else:
        # logging.error("What type is this? :{}".format(self.ctype))
        self.lo, self.hi = lo, hi
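
# Worked example for the '>='/'between' widening above (hypothetical
# numbers): with column stats minv=1, maxv=100 and uniq=50 distinct
# values, step = round((100 - 1) / 50) = 2, so a predicate bound of
# hi=10 is stored as self.hi=12. The range is padded by about one
# distinct-value stride to compensate for bounds that fall between
# observed values.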
def fromJsonFile(mfile, blacklist, col_stats):
    """
    @desc Construct a MalDictionary object from a JSON file
    @arg mfile     : str                    //json file containing query run
    @arg blacklist : list<str>              //list of blacklisted mal ins
    @arg col_stats : dict<str,ColumnStats>  //column statistics
    """
    if type(mfile) == bytes:
        open_func = lambda b, mode, encoding: io.StringIO(b.decode(encoding))
    elif Utils.is_gzipped(mfile):
        open_func = gzip.open
    else:
        open_func = open

    with open_func(mfile, mode='rt', encoding='utf-8') as f:
        maldict = defaultdict(_make_list)
        startd = {}
        query_tags = set()
        for line in f.readlines():
            jobj = json.loads(line)
            if jobj is None:
                break
            fname, args, ret = Utils.extract_fname(jobj["short"])
            if not Utils.is_blacklisted(blacklist, fname):
                if jobj["state"] == "start":
                    startd[jobj["pc"]] = jobj["clk"]
                elif jobj["state"] == "done":
                    assert jobj["pc"] in startd
                    new_mals = MalInstruction.fromJsonObj(jobj, col_stats)
                    new_mals.time = int(jobj["clk"]) - int(startd[jobj["pc"]])
                    new_mals.start = int(startd[jobj["pc"]])
                    maldict[fname].append(new_mals)
                    query_tags.add(int(jobj["tag"]))

    return MalDictionary(maldict, list(query_tags), col_stats)
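
# A minimal usage sketch (the paths are hypothetical). Each trace line is
# a JSON object; the parser above relies only on the "short", "state",
# "pc", "clk", "tag", "ret" and "arg" fields, and pairs every "done"
# event with its earlier "start" event via the program counter:
#
#   blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
#   col_stats = ColumnStatsD.fromFile("config/tpch_sf10_stats.txt")
#   d = MalDictionary.fromJsonFile("traces/tpch-sf10/01.json",
#                                  blacklist, col_stats)
#   print(sorted(d.query_tags))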
def analyze_select_error_air(db, q, ntrain=1000, step=25):
    assert db == 'tpch10' or db == 'airtraffic'
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))
    logging.info("Examining Query: {}".format(q))
    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db, q=q,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)
    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)
    # keep only the select instructions that return a non-empty result
    seld = testd.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
    seli = [i for i in seld.getInsList() if i.ret_size > 0]
    train_tags = traind.query_tags
    train_tags.sort()
    f = "{:120} realm: {:10.1f} predm: {:10.1f}, argc: {:10.0f} pr_argc {:10.0f}\n"
    for i in range(1, ntrain, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i + 1])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        error = 0
        for ins in seli:
            p = ins.predict(d12, pG)[0]
            rs = ins.ret_size
            pm = p.getMem()
            rs_mb = rs / 1_000_000
            pm_mb = pm / 1_000_000
            print(f.format(ins.short, rs_mb, pm_mb, ins.argCnt(),
                           ins.approxArgCnt(pG)))
            print("NNi ", p.ins.short)
            error += 100 * abs((pm - rs) / rs)
            print("local error == ", 100 * abs((pm - rs) / rs))
        print("select error == ", error / len(seli))
def plot_select_error(q):
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')
    for qno in q:
        logging.info("Testing query {}".format(qno))
        # use a separate name for the zero-padded string so we do not
        # shadow the parameter q we are iterating over
        qstr = "{}".format(qno)
        if qno < 10:
            qstr = "0{}".format(qstr)
        logging.info("loading training set...")
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch_sf10/ran{}_200_sf10.json".format(qno),
            blacklist, col_stats)
        logging.info("loading test set...")
        d2 = MalDictionary.fromJsonFile("traces/tpch-sf10/{}.json".format(qstr),
                                        blacklist, col_stats)
        sel2 = d2.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
        seli = sel2.getInsList()
        train_tags = d1.query_tags
        train_tags.sort()
        e = []
        ind = []
        for i in [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150, 175, 200]:
            d12 = d1.filter(lambda ins: ins.tag in train_tags[0:i])
            print(len(d12.query_tags))
            pG = d2.buildApproxGraph(d12)
            error = 0
            for ins in seli:
                p = ins.predict(d12, pG)[0]
                cnt = ins.cnt
                pc = p.cnt
                # guard against empty results, as in plot_select_error_air
                if cnt > 0:
                    error += 100 * abs((pc - cnt) / cnt)
            e.append(error / len(seli))
            ind.append(i)
        print(e)
def analyze_mem_error_air(db, q, ntrain=1000, step=25):
    """ Useful for analysing prediction results """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")
    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))
    logging.info("Examining Query: {}".format(q))
    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db, q=q,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)
    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)
    train_tags = traind.query_tags
    train_tags.sort()
    for i in range(1, ntrain, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print("Number of train queries: ", len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        insl = testd.getInsList()
        insl.sort(key=lambda inst: inst.clk)
        for ins in insl:
            p = ins.predict(d12, pG)[0]
            actual_size_mb = ins.ret_size / 1_000_000
            predic_size_mb = p.getMem() / 1_000_000
            print("{:120} actual: {:10.1f} pred: {:10.1f}\n".format(
                ins.short, actual_size_mb, predic_size_mb))
def main(args):
    blacklist = Utils.init_blacklist(BLACKLIST)
    col_stats = ColumnStatsD.fromFile(COLSTATS)
    sys.stdout.flush()
    dataset = MalDictionary.fromJsonFile(args.input_file, blacklist, col_stats)
    tags = sorted(dataset.query_tags)
    if args.limit:
        tags = tags[:args.limit]
    tag_map_file = open(args.tag_map, 'a') if args.tag_map else None
    counter = 0
    for tag in tags:
        out_name = args.output_files.replace('XXX', '%03d' % counter)
        short_name = os.path.basename(out_name)
        if '.' in short_name:
            short_name = short_name[:short_name.index('.')]
        if tag_map_file:
            tag_map_file.write('{}:{}\n'.format(tag, short_name))
        counter += 1
        # pickle the instructions of this single query, lz4-compressed
        # if the output name asks for it
        contents = dataset.filter(lambda x: x.tag == tag)
        with open(out_name, 'wb') as f:
            if out_name.endswith('.lz4'):
                f = lz4frame.LZ4FrameFile(f, mode='wb')
            sys.stderr.write('\r[{}/{}] tag {}'.format(counter, len(tags), tag))
            pickle.dump(contents, f)
            f.close()
    sys.stderr.write('\rDone          \n')
    if tag_map_file:
        tag_map_file.close()
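
# Hypothetical invocation (the script name and flag spellings depend on
# the argparse setup elsewhere in this file and are assumptions here);
# 'XXX' in the output pattern is replaced by a zero-padded counter, and a
# '.lz4' suffix switches on lz4-framed compression:
#
#   python split_traces.py --input-file traces/all.json \
#       --output-files out/qXXX.pickle.lz4 --tag-map out/tags.map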
def fromJsonObj(jobj, stats):
    size = int(jobj["size"])
    pc = int(jobj["pc"])
    clk = int(jobj["clk"])
    short = jobj["short"]
    fname, _, _ = Utils.extract_fname(jobj["short"])
    tag = int(jobj["tag"])
    # rv = [rv.get("size", 0) for rv in jobj["ret"]]
    ro = jobj.get("ret", [])  # return objects
    ret_size = sum([o.get("size", 0) for o in ro if int(o["eol"]) == 0])
    arg_size = sum([o.get("size", 0) for o in jobj.get("arg", [])])
    arg_list = [Arg.fromJsonObj(e) for e in jobj.get("arg", [])]
    ret_args = [Arg.fromJsonObj(e) for e in ro]  # if e["eol"] == 0]
    free_size = sum([arg.size for arg in arg_list if arg.eol == 1])
    arg_vars = [arg.name for arg in arg_list if arg.isVar()]
    ret_vars = [ret['name'] for ret in ro if Utils.isVar(ret['name'])]
    count = int(jobj["ret"][0].get("count", 0))

    con_args = [
        pc, clk, short, fname, size, ret_size, tag, arg_size, arg_list,
        ret_args, free_size, arg_vars, ret_vars, count
    ]
    # Select instructions
    if fname in ['select', 'thetaselect', 'likeselect']:
        return SelectInstruction(*con_args, jobj=jobj, stats=stats)  # TODO replace jobj
    # Projections
    elif fname in ['projectionpath']:
        return DirectIntruction(*con_args, base_arg_i=0)
    elif fname in ['projection', 'projectdelta']:
        return ProjectInstruction(*con_args)
    # Joins
    elif fname in ['join', 'thetajoin', 'crossproduct']:
        return JoinInstruction(*con_args)
    # Group instructions
    elif fname in ['group', 'subgroup', 'subgroupdone', 'groupdone']:
        a0 = arg_list[0].col.split('.')[::-1][0]
        return GroupInstruction(*con_args, base_arg_i=0, base_col=a0,
                                col_stats=stats.get(a0, None))
    # Set instructions: the last parameter determines how to compute the
    # prediction for this MAL instruction (see the sketch after this
    # function)
    elif fname in ['intersect']:
        return SetInstruction(*con_args, i1=0, i2=1, fun=min)
    elif fname in ['mergecand']:
        return SetInstruction(*con_args, i1=0, i2=1, fun=_lambda_add)
    elif fname in ['difference']:
        return SetInstruction(*con_args, i1=0, i2=1, fun=_lambda_lefthand)
    elif fname in ['<', '>', '>=', '<=']:
        if arg_list[1].isVar():
            return SetInstruction(*con_args, i1=0, i2=1, fun=min)
        else:
            return DirectIntruction(*con_args, base_arg_i=0)
    # Direct instructions
    elif fname in ['+', '-', '*', '/', 'or', 'dbl', 'and', 'lng', '%']:
        if arg_list[0].isVar():
            return DirectIntruction(*con_args, base_arg_i=0)
        elif arg_list[1].isVar():
            return DirectIntruction(*con_args, base_arg_i=1)
        else:
            return ReduceInstruction(*con_args)
    elif fname in ['==', 'isnil', '!=', 'like']:
        return DirectIntruction(*con_args, base_arg_i=0)
    elif fname in ['sort']:
        return DirectIntruction(*con_args, base_arg_i=0, base_ret_i=1)
    elif fname in ['subsum', 'subavg', 'subcount', 'submin']:
        return DirectIntruction(*con_args, base_arg_i=2)
    elif fname in ['subslice']:
        return DirectIntruction(*con_args, base_arg_i=0)
    elif fname in ['firstn']:
        argl = len(arg_list)
        assert argl == 4 or argl == 6
        # n is the requested number of results; currently unused, since
        # fun=min only bounds the estimate by the base argument count
        n = int(arg_list[3].aval) if argl == 6 else int(arg_list[1].aval)
        return DirectIntruction(*con_args, base_arg_i=0, fun=min)
    elif fname in ['hash', 'bulk_rotate_xor_hash', 'identity', 'mirror',
                   'year', 'ifthenelse', 'delta', 'substring', 'project',
                   'int', 'floor']:
        return DirectIntruction(*con_args, base_arg_i=0)
    elif fname in ['dbl']:
        # unreachable: 'dbl' is already handled by the arithmetic branch above
        return DirectIntruction(*con_args, base_arg_i=1)
    elif fname in ['hge']:
        if arg_list[1].cnt > 0:
            return DirectIntruction(*con_args, base_arg_i=1)
        else:
            return ReduceInstruction(*con_args)
    elif fname in ['append']:
        return DirectIntruction(*con_args, base_arg_i=0, fun=_lambda_inc)
    elif fname in ['max', 'min']:
        if len(arg_list) == 1:
            return ReduceInstruction(*con_args)
        else:
            assert len(arg_list) == 2
            return DirectIntruction(*con_args, base_arg_i=0)
    # Aggregate instructions (result = 1)
    elif fname in ['sum', 'avg', 'single', 'dec_round']:
        return ReduceInstruction(*con_args)
    elif fname in ['new']:
        return NullInstruction(*con_args)
    # Load instructions
    elif fname in ['tid', 'bind', 'bind_idxbat']:
        return LoadInstruction(*con_args)
    else:
        # logging.error("Unknown instruction: {}".format(fname))
        return MalInstruction(*con_args)
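
# A minimal illustration of the SetInstruction size estimators used in
# the dispatch above (the row counts are hypothetical). With candidate
# lists of 1000 and 300 rows as the two arguments:
#
#   intersect  -> min(1000, 300) = 300    (cannot exceed the smaller side)
#   mergecand  -> 1000 + 300     = 1300   (union of the candidate lists)
#   difference -> 1000                    (bounded by the left operand)
#
# assuming the helpers are defined along the lines of
#   _lambda_add = lambda x, y: x + y
#   _lambda_lefthand = lambda x, y: x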
def predict(self, traind, approxG, default=None):
    """
    @desc run kNN to find the 5 closest instructions based on the range bounds
    range extrapolation: (self.hi - self.lo) / (traini.hi - traini.lo)
    arg extrapolation  : self.arg_cnt / traini.arg_cnt
    prediction(traini) = traini.cnt * range_extrapolation * arg_extrapolation
    """
    assert approxG is not None
    # First: build a candidate list of training instructions with the
    # same name
    self_list = traind.mal_dict.get(self.fname, [])
    # prev_list = []
    #
    # tmp = self
    # while tmp.prev_i is not None:
    #     prev_list.append(tmp.prev_i)
    #     tmp = tmp.prev_i
    # prev_list.reverse()
    # curr_nn = self_list
    # maxk = 5 * (2 ** len(prev_list))
    # for node in prev_list:
    #     k = int(maxk / 2)
    #     curr_level = [ins for ins in curr_nn if node.col == ins.col]
    #     curr_nn = node.kNN(curr_level, k, approxG)
    #     maxk = maxk / 2
    # if self.proj_col != 'TMP' and self.prev_i is not None:
    #     level1 = [ins for ins in self_list if self.proj_col == ins.col]
    #     logging.error("len level1: {}".format(len(level1)))
    #     logging.error("testing {}".format(self.short))
    #     prev_nn = self.prev_i.kNN(level1, 100, approxG)
    #     cand_list = list([ins.next_i for ins in prev_nn])
    # else:
    #     cand_list = self_list
    # Second: filter the candidate list by column, projection column and
    # number of arguments
    cand_list = [
        ins for ins in self_list
        if self.col == ins.col and self.proj_col == ins.proj_col
        and len(ins.arg_list) == len(self.arg_list)
    ]
    # random.shuffle(cand_list)
    nn = self.kNN(cand_list, 5, approxG)
    rt = self.ret_args[0].atype  # return type
    # DEBUG
    if self.fname == 'thetaselect' and self.op == '>':
        for ins in nn:
            logging.debug("NN: {} {}".format(ins.cnt, ins.short))
    if len(nn) == 0:
        logging.error("kNN returned 0 candidates in select?? {} {} {} {}".format(
            self.short, self.op, self.ctype, self.col))
        # logging.error("Cand len {}".format(len(prev_nn)))
        logging.error("self col: {} proj col {}".format(self.col, self.proj_col))
        # for di in [ins for ins in self_list if self.col == ins.col]:
        #     logging.error("cand: {} {} {}".format(di.short, di.col, di.proj_col))
        # note: execution falls through here, so nn[0] below will raise
    # still just for debugging: keep only one instruction instead of 5
    nn.sort(key=lambda ins: self.approxArgDist(ins, approxG))
    nn1 = nn[0]
    arg_cnt = self.approxArgCnt(approxG)
    if arg_cnt is not None:
        # scale each neighbour up/down in proportion to the argument counts
        # (note: the sum skips neighbours with argCnt() == 0 but the
        # average still divides by len(nn))
        avg = sum([
            i.extrapolate(self) * (arg_cnt / i.argCnt())
            for i in nn if i.argCnt() > 0
        ]) / len(nn)
        cal_avg = min(avg, arg_cnt)  # calibration: a select returns at most arg_cnt rows
        avgm = cal_avg * Utils.sizeof(rt)
        cnt1 = (nn1.extrapolate(self) * arg_cnt / nn1.argCnt()
                if nn1.argCnt() > 0 else nn1.extrapolate(self))
        return [Prediction(retv=self.ret_vars[0], ins=nn1, cnt=cnt1,
                           avg=cal_avg, t=rt, mem=avgm)]
    else:
        logging.error("None arguments??? {}".format(self.lead_arg.name))
        avg = sum([i.extrapolate(self) for i in nn]) / len(nn)
        return [Prediction(retv=self.ret_vars[0], ins=nn1,
                           cnt=nn1.extrapolate(self), avg=avg, t=rt)]
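
# Worked example of the docstring formulas above (hypothetical numbers):
# suppose the nearest training thetaselect had cnt=250_000 result rows,
# argCnt()=1_000_000 and range hi-lo=10, while the test instruction has
# approxArgCnt=2_000_000 and range hi-lo=20. Then
#
#   range extrapolation = 20 / 10 = 2
#   arg extrapolation   = 2_000_000 / 1_000_000 = 2
#   prediction          = 250_000 * 2 * 2 = 1_000_000 rows
#
# and the calibrated average is then capped at arg_cnt, since a select
# can never return more rows than it receives.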