Beispiel #1
0
def plot_max_mem_error(q):
    """Plot max-memory prediction error for each TPC-H query in *q*.

    @arg q: iterable of query numbers (ints)

    For every query, the model is trained on growing prefixes of a
    200-query random trace; the percentage error between predicted and
    actual max memory is plotted to results/memf_error_q<no>.pdf.
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')

    for qno in q:
        logging.info("Testing query {}".format(qno))
        # Zero-padded query id for the test-trace file name.
        # (The original reassigned the parameter `q` here, shadowing the
        # collection being iterated -- renamed to avoid that.)
        qid = "0{}".format(qno) if qno < 10 else "{}".format(qno)

        logging.info("loading training set...")
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch_sf10/ran{}_200_sf10.json".format(qno),
            blacklist, col_stats)
        logging.info("loading test set...")
        d2 = MalDictionary.fromJsonFile("traces/tpch-sf10/{}.json".format(qid),
                                        blacklist, col_stats)
        train_tags = d1.query_tags
        train_tags.sort()
        e = []
        ind = []
        # grow the training set and record the error at each size
        for i in [
                1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150, 175, 200
        ]:
            d12 = d1.filter(lambda ins: ins.tag in train_tags[0:i])
            print(len(d12.query_tags))
            pG = d2.buildApproxGraph(d12)
            pmm = d2.predictMaxMem(pG) / 1000000000  # bytes -> GB
            mm = d2.getMaxMem() / 1000000000
            e.append(100 * abs((pmm - mm) / mm))
            ind.append(i)
        print(e)
        Utils.plotBar(ind, e, "results/memf_error_q{}.pdf".format(qno),
                      'nof training queries', 'error perc')
Beispiel #2
0
def plot_allmem_tpch10(path=""):
    """Plot max-memory prediction error for TPC-H queries 1-22 (sf10).

    @arg path: output directory prefix for the generated PDF
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')

    # TODO: use os.path.join for the following
    # Hoisted out of the loop: the value never depends on qno, and the
    # old placement left `outf` undefined if the loop body never ran.
    outf = path + "mem_error_1-23.pdf"

    e = []
    for qno in range(1, 23):
        q = "0{}".format(qno) if qno < 10 else "{}".format(qno)
        logging.info("Examining Query: {}".format(q))
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch10/ran_q{}_n200_tpch10.json".format(q),
            blacklist, col_stats)
        d2 = MalDictionary.fromJsonFile("traces/tpch10/{}.json".format(q),
                                        blacklist, col_stats)

        pG = d2.buildApproxGraph(d1)

        pmm = d2.predictMaxMem(pG) / 1_000_000_000  # bytes -> GB
        mm = d2.getMaxMem() / 1_000_000_000

        err = 100 * abs((pmm - mm) / mm)

        print("query: {}, pred mem: {}, actual mem: {}, error {}".format(
            qno, pmm, mm, err))
        e.append(err)
        print(err)
    # NOTE(review): label order ('error perc' first) differs from other
    # plotBar calls in this module -- confirm intended axis order.
    Utils.plotBar(range(1, 23), e, outf, 'error perc', 'query no')
Beispiel #3
0
def predict(definition):
    """Load the stored model and parse the demo plan for *definition*.

    @arg definition: experiment definition providing blacklist/stats/plan paths

    BUG FIX: col_stats was previously loaded with
    Utils.init_blacklist(definition.stats_path()); every other function
    in this module loads column statistics with ColumnStatsD.fromFile,
    so that was a copy/paste error.
    """
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())

    model = load_model(definition)
    plan = definition.demo_plan_file()
    plan_dict = MalDictionary.fromJsonFile(plan, blacklist, col_stats)
Beispiel #4
0
def leave_one_out(definition):
    """Leave-one-out evaluation of max-memory prediction.

    For every query tag in the data set: train on all remaining tags,
    predict the left-out query's max memory, and record the percentage
    error. Per-iteration timings and errors are written to the result
    file; the error series is plotted at the end.
    """
    initial_time = datetime.datetime.now()
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    query_num = definition.query_num()

    dataset_dict = None
    # Note: the commented code below does not work because
    # writeToFile, and loadFromFile are broken. When they are fixed
    # this should speed up the whole procedure a bit, because we will
    # not need to parse a big trace file.

    # if os.path.exists(definition['model_file']) and os.path.isfile(definition['model_file']):
    #     try:
    #         dataset_dict = MalDictionary.loadFromFile(definition['model_file'])
    #     except:
    #         logging.warning('Could not load model file: {}. Rebuilding.'.format(definition['model_file']))
    #         dataset_dict = None

    if dataset_dict is None:
        print('Loading traces for query: {:02}...'.format(query_num), end='')
        sys.stdout.flush()
        load_start = datetime.datetime.now()
        dataset_dict = MalDictionary.fromJsonFile(definition.data_file(),
                                                  blacklist, col_stats)
        load_end = datetime.datetime.now()
        print('Done: {}'.format(load_end - load_start))
        # dataset_dict.writeToFile(definition['model_file'])

    errors = list()
    cnt = 0
    total = len(dataset_dict.query_tags)
    # with-statement guarantees the result file is closed even on error
    with open(definition.result_file(), 'w') as pl:
        for leaveout_tag in dataset_dict.query_tags:
            iter_start = datetime.datetime.now()
            # in-place textual progress indicator
            print("\b\b\b\b", end='')
            print('{:03}%'.format(int(100 * cnt / total)), end='')
            sys.stdout.flush()
            cnt += 1
            test_dict = dataset_dict.filter(lambda x: x.tag == leaveout_tag)
            train_dict = dataset_dict.filter(lambda x: x.tag != leaveout_tag)

            graph = test_dict.buildApproxGraph(train_dict)

            predict_start = datetime.datetime.now()
            predicted_mem = test_dict.predictMaxMem(graph)
            actual_mem = test_dict.getMaxMem()
            iter_end = datetime.datetime.now()

            # signed error (no abs): over/under-prediction stays visible
            errors.append(100 * (predicted_mem - actual_mem) / actual_mem)
            pl.write("{} {} {}\n".format(iter_end - iter_start,
                                         iter_end - predict_start,
                                         errors[cnt - 1]))

    print("")
    outfile = definition.out_path('Q{:02}_memerror.pdf'.format(query_num))
    print()
    # BUG FIX: arange(1, cnt) has cnt-1 entries while `errors` has cnt,
    # so x and y series were misaligned; use cnt + 1.
    Utils.plotLine(numpy.arange(1, cnt + 1), errors, outfile, 'Error percent',
                   'Leave out query')
Beispiel #5
0
def plot_select_error_air(db,
                          q,
                          trainq=None,
                          path="",
                          ntrain=1000,
                          step=25,
                          output=None):
    """Plot the (theta)select result-size prediction error for one query.

    @arg db    : 'tpch10' or 'airtraffic' -- selects trace/stats paths
    @arg q     : query id of the test trace
    @arg trainq: query id of the training trace (defaults to q)
    @arg path  : output directory prefix (ignored when output is given)
    @arg ntrain: number of random training queries available
    @arg step  : training-set growth step
    @arg output: explicit output PDF path, overrides path-based name
    """
    assert db == 'tpch10' or db == 'airtraffic'
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))

    if trainq is None:
        trainq = q

    logging.info("Examining Query: {}".format(q))

    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=trainq,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    # filter only select instructions
    seld = testd.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
    seli = seld.getInsList()

    train_tags = traind.query_tags
    train_tags.sort()
    # (dead duplicate `e = []` before the loads removed)
    e = []
    ind = []
    # kutsurak: This loop increases the queries we use to train the
    # model.
    for i in range(1, ntrain + 2, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        error = 0
        for ins in seli:
            p = ins.predict(d12, pG)[0]
            cnt = ins.ret_size
            pc = p.getMem()
            # we use abs so that the errors do not cancel out
            if cnt > 0:
                error += 100 * abs((pc - cnt) / cnt)
        e.append(error / len(seli))
        ind.append(i)

    print("error array:", e)
    outpdf = path + '{}_sel{}_error.pdf'.format(
        db, q) if output is None else output
    Utils.plotLine(ind, e, outpdf, 'Error perc', 'Nof training queries')
Beispiel #6
0
def analyze_max_mem():
    """Per-instruction memory-prediction analysis for TPC-H query 19.

    Trains on a 200-query random trace, then prints predicted vs actual
    result size (plus count predictions) for every instruction in the
    test trace whose actual size exceeds 10 KB, sorted by start time.
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')

    # (dead `qno = 19` pre-assignment removed; the loop binds qno)
    for qno in range(19, 20):
        logging.info("loading training set...")
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch_sf10/ran_q{}_n200_tpch10.json".format(qno),
            blacklist, col_stats)
        logging.info("loading test set...")
        d2 = MalDictionary.fromJsonFile("traces/tpch-sf10/{}.json".format(qno),
                                        blacklist, col_stats)

        pG = d2.buildApproxGraph(d1)
        # sel2 = d2.filter(lambda ins: ins.fname in ['select','thetaselect'])

        testi = d2.getInsList()
        testi.sort(key=lambda ins: ins.clk)
        for ins in testi:
            pmm = ins.approxMemSize(pG)
            mm = ins.ret_size

            # `mm > 0 and mm > 10000` collapsed: the second test implies
            # the first, so keep only the 10 KB threshold
            if mm > 10000:
                err = 100 * abs((pmm - mm) / mm)
                print(ins.short)
                print(
                    "query: {}, pred mem: {}, actual mem: {}, error {}".format(
                        qno, pmm, mm, err))
                print("cnt: {} pred cnt: {}".format(ins.cnt,
                                                    ins.predict(d1,
                                                                pG)[0].avg))
                print("")
Beispiel #7
0
def train_model(definition):
    """Parse the demo training traces and persist the model to disk.

    @arg definition: experiment definition providing all file paths
    @ret MalDictionary built from the demo training set
    """
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    # BUG FIX: column statistics were loaded with Utils.init_blacklist;
    # the rest of this module uses ColumnStatsD.fromFile for stats files.
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    print('Loading traces for demo... ', end='')
    sys.stdout.flush()
    training_set = definition.demo_training_set()
    dataset_mal = MalDictionary.fromJsonFile(training_set, blacklist,
                                             col_stats)
    print('Done')
    print('Writing model to disk: {}... '.format(
        definition.demo_model_storage()),
          end='')
    dataset_mal.writeToFile(definition.demo_model_storage())
    print('Done')

    return dataset_mal
Beispiel #8
0
def plot_actual_memory(definition):
    """Write the actual memory footprint of every query to the result file.

    Output format: one "<tag>,<max_mem>" line per query tag.
    (Despite the name, this function only writes the CSV -- no plot.)
    """
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())

    print('Loading traces...', end='')
    sys.stdout.flush()
    data_file = definition.data_file()
    load_start = datetime.datetime.now()
    dataset_dict = MalDictionary.fromJsonFile(data_file, blacklist, col_stats)
    load_end = datetime.datetime.now()
    print('Done: {}'.format(load_end - load_start))

    print('Computing footprint...     ', end='')
    sys.stdout.flush()
    cnt = 0
    total = len(dataset_dict.query_tags)
    # with-statement closes the output even on error; the unused
    # `result = dict()` accumulator was removed.
    with open(definition.result_file(), 'w') as ofl:
        for t in dataset_dict.query_tags:
            # in-place textual progress indicator
            print("\b\b\b\b", end='')
            print('{:03}%'.format(int(100 * cnt / total)), end='')
            sys.stdout.flush()
            cnt += 1
            # get the total memory for a specific query
            tq = dataset_dict.filter(lambda x: x.tag == t)
            total_mem = tq.getMaxMem()
            ofl.write("{},{}\n".format(t, total_mem))

    print("")
Beispiel #9
0
def plot_mem_error_air(db,
                       q,
                       trainq=None,
                       path="",
                       output=None,
                       ntrain=1000,
                       step=25):
    """Plot max-memory prediction error versus training-set size.

    @arg db    : database name used to locate traces and stats
    @arg q     : query id of the test trace
    @arg trainq: query id of the training trace (defaults to q)
    @arg path  : output directory prefix (ignored when output is given)
    @arg output: explicit output PDF path
    @arg ntrain: number of random training queries available
    @arg step  : training-set growth step
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))

    if trainq is None:
        trainq = q

    logging.info("Examining Query: {}".format(q))

    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=trainq,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    train_tags = traind.query_tags
    train_tags.sort()
    # (dead duplicate `e = []` before the loads removed)
    e = []
    ind = []
    for i in range(1, ntrain + 2, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        pmm = testd.predictMaxMem(pG)
        mm = testd.getMaxMem()
        # print(pmm / 1000000, mm / 1000000)
        # signed error: shows over- vs under-prediction
        e.append(100 * ((pmm - mm) / mm))
        ind.append(i)

    print(e)
    outf = path + '{}_q{}_memerror.pdf'.format(db,
                                               q) if output is None else output
    Utils.plotLine(ind, e, outf, 'Error perc', 'Nof training queries')
Beispiel #10
0
    def __init__(self, *def_args, jobj, stats):
        """Initialise a select-instruction node from a JSON trace object.

        @arg def_args: positional arguments forwarded verbatim to
                       MalInstruction.__init__
        @arg jobj    : dict // parsed JSON trace record for this instruction
        @arg stats   : dict<str,ColumnStats> // per-column statistics
        """
        MalInstruction.__init__(self, *def_args)
        # this is the column type
        self.ctype = jobj["arg"][0].get("type", "UNKNOWN")
        alias_iter = iter([o["alias"] for o in jobj["arg"] if "alias" in o])
        # this is the initial column
        self.col = next(alias_iter, "TMP").split('.')[-1]
        # if we have a projection argument, this is the projection column
        self.proj_col = next(alias_iter, "TMP").split('.')[-1]
        # raw byte size of each argument (0 when the trace omits "size")
        self.arg_size = [o.get("size", 0) for o in jobj.get("arg", [])]
        self.op = Utils.extract_operator(self.fname, jobj)

        # the "lead" argument drives the prediction: prefer arg 1 when it
        # is a variable with a positive count, otherwise fall back to arg 0
        a1 = self.arg_list[1]
        self.lead_arg_i = 1 if a1.isVar() and a1.cnt > 0 else 0
        self.lead_arg = self.arg_list[self.lead_arg_i]

        # predicate range bounds; falls back to an all-zero ColumnStats
        # when the column is not in the statistics dictionary
        lo, hi = Utils.hi_lo(self.fname, self.op, jobj,
                             stats.get(self.col, ColumnStats(0, 0, 0, 0, 0)))

        # missing bounds are serialised as the string 'nil' in the trace
        if lo == 'nil':
            lo = 0

        if hi == 'nil':
            hi = 0

        if self.ctype in [
                'bat[:int]', 'bat[:lng]', 'lng', 'bat[:hge]', 'bat[:bte]',
                'bat[:sht]'
        ]:
            if self.op in ['>=', 'between'] and self.col in stats:
                # widen the upper bound by the average gap between distinct
                # values so inclusive ranges cover the boundary value
                s = stats[self.col]
                step = round((int(s.maxv) - int(s.minv)) / int(s.uniq))
                self.lo, self.hi = (int(lo), int(hi) + step)
            else:  # TODO <=
                self.lo, self.hi = (int(lo), int(hi))
        elif self.ctype == 'bat[:date]':
            # NOTE(review): assumes `datetime` is the class here
            # (from datetime import datetime) -- confirm module imports
            self.lo = datetime.strptime(lo, '%Y-%m-%d')
            self.hi = datetime.strptime(hi, '%Y-%m-%d')
        else:
            # logging.error("Wtf type if this ?? :{}".format(self.ctype))
            self.hi = hi
            self.lo = lo
Beispiel #11
0
    def fromJsonFile(mfile, blacklist, col_stats):
        """
        @des Construct a MalDictionary object from a JSON file
        @arg mfile    : str                   //json file containing query run
        @arg blacklist: list<str>             //list of blacklisted mal ins
        @arg col_stats: dict<str,ColumnStats> //column statistics
        @ret MalDictionary // instructions grouped by function name

        The file may also be given as a bytes blob or as a gzipped file;
        each line must be one JSON trace record.
        """
        # choose how to open the input: in-memory bytes, gzip, or plain file
        if type(mfile) == bytes:
            open_func = lambda b, mode, encoding: io.StringIO(
                b.decode(encoding))
        elif Utils.is_gzipped(mfile):
            open_func = gzip.open
        else:
            open_func = open
        with open_func(mfile, mode='rt', encoding='utf-8') as f:
            maldict = defaultdict(_make_list)
            # maps pc -> clk of the matching "start" event
            startd = {}
            query_tags = set()

            lines = f.readlines()
            for line in lines:
                jobj = json.loads(line)
                # a literal JSON "null" line terminates the trace
                if jobj is None:
                    break
                fname, args, ret = Utils.extract_fname(jobj["short"])

                if not Utils.is_blacklisted(blacklist, fname):
                    if jobj["state"] == "start":
                        startd[jobj["pc"]] = jobj["clk"]
                    elif jobj["state"] == "done":
                        # every "done" must have seen its "start" first
                        assert jobj["pc"] in startd
                        new_mals = MalInstruction.fromJsonObj(jobj, col_stats)
                        # elapsed time = done clk - start clk
                        new_mals.time = int(jobj["clk"]) - int(
                            startd[jobj["pc"]])
                        new_mals.start = int(startd[jobj["pc"]])
                        maldict[fname].append(new_mals)
                        query_tags.add(int(jobj["tag"]))

        return MalDictionary(maldict, list(query_tags), col_stats)
Beispiel #12
0
def analyze_select_error_air(db, q, ntrain=1000, step=25):
    """Print per-instruction (theta)select prediction diagnostics.

    @arg db    : 'tpch10' or 'airtraffic' -- selects trace/stats paths
    @arg q     : query id
    @arg ntrain: number of random training queries available
    @arg step  : training-set growth step
    """
    assert db == 'tpch10' or db == 'airtraffic'
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))

    logging.info("Examining Query: {}".format(q))

    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=q,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    # filter only select instructions with a non-empty result
    seld = testd.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
    seli = [i for i in seld.getInsList() if i.ret_size > 0]

    train_tags = traind.query_tags
    train_tags.sort()
    f = "{:120} realm: {:10.1f} predm: {:10.1f}, argc: {:10.0f} pr_argc {:10.0f}\n"

    for i in range(1, ntrain, step):
        d12 = traind.filter(lambda ins: ins.tag in train_tags[0:i + 1])
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        error = 0
        for ins in seli:
            p = ins.predict(d12, pG)[0]
            rs = ins.ret_size
            pm = p.getMem()
            rs_mb = rs / 1_000_000
            # reuse pm instead of calling p.getMem() a second time
            pm_mb = pm / 1_000_000
            print(
                f.format(ins.short, rs_mb, pm_mb, ins.argCnt(),
                         ins.approxArgCnt(pG)))
            print("NNi ", p.ins.short)
            error += 100 * abs((pm - rs) / rs)
            print("local error == ", 100 * abs((pm - rs) / rs))
        print("select error == ", error / len(seli))
Beispiel #13
0
def plot_select_error(q):
    """Compute (theta)select count-prediction error for each query in *q*.

    @arg q: iterable of query numbers (ints)

    For every query the model is trained on growing prefixes of a
    200-query random trace; the mean per-instruction count error is
    collected and printed for each training-set size.
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/tpch_sf10_stats.txt')

    for qno in q:
        logging.info("Testing query {}".format(qno))
        # Zero-padded query id for the test-trace file name.
        # (The original reassigned the parameter `q` here, shadowing the
        # collection being iterated -- renamed to avoid that.)
        qid = "0{}".format(qno) if qno < 10 else "{}".format(qno)

        logging.info("loading training set...")
        d1 = MalDictionary.fromJsonFile(
            "traces/random_tpch_sf10/ran{}_200_sf10.json".format(qno),
            blacklist, col_stats)
        logging.info("loading test set...")
        d2 = MalDictionary.fromJsonFile("traces/tpch-sf10/{}.json".format(qid),
                                        blacklist, col_stats)
        sel2 = d2.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
        seli = sel2.getInsList()
        train_tags = d1.query_tags
        train_tags.sort()
        e = []
        ind = []
        for i in [
                1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150, 175, 200
        ]:
            d12 = d1.filter(lambda ins: ins.tag in train_tags[0:i])
            print(len(d12.query_tags))
            pG = d2.buildApproxGraph(d12)
            error = 0
            for ins in seli:
                p = ins.predict(d12, pG)[0]
                cnt = ins.cnt
                pc = p.cnt
                # guard against zero counts (consistent with
                # plot_select_error_air, which skips cnt == 0 and would
                # otherwise divide by zero here)
                if cnt > 0:
                    error += 100 * abs((pc - cnt) / cnt)
            e.append(error / len(seli))
            ind.append(i)
        print(e)
Beispiel #14
0
def analyze_mem_error_air(db, q, ntrain=1000, step=25):
    """
    Useful for analyse prediction results
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))

    logging.info("Examining Query: {}".format(q))

    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=q,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    # sort the training tags so prefixes are deterministic
    tags = traind.query_tags
    tags.sort()
    for nqueries in range(1, ntrain, step):
        # train on the first `nqueries` tags only
        subset = traind.filter(lambda ins: ins.tag in tags[0:nqueries])
        print("Number of train queries: ", len(subset.query_tags))
        graph = testd.buildApproxGraph(subset)
        instructions = testd.getInsList()
        instructions.sort(key=lambda inst: inst.clk)
        # dump actual vs predicted result size (MB) per instruction
        for inst in instructions:
            prediction = inst.predict(subset, graph)[0]
            actual_mb = inst.ret_size / 1_000_000
            predicted_mb = prediction.getMem() / 1_000_000
            print("{:120} actual: {:10.1f} pred: {:10.1f}\n".format(
                inst.short, actual_mb, predicted_mb))
Beispiel #15
0
def main(args):
    """Split one big trace file into per-query-tag pickle files.

    Reads args.input_file into a MalDictionary, then writes each query
    tag's instructions to a separate pickle whose name is derived from
    args.output_files ('XXX' replaced by a running counter). Optionally
    appends "tag:short_name" lines to args.tag_map.
    """
    blacklist = Utils.init_blacklist(BLACKLIST)
    col_stats = ColumnStatsD.fromFile(COLSTATS)

    sys.stdout.flush()
    dataset = MalDictionary.fromJsonFile(
        args.input_file,
        blacklist,
        col_stats
    )
    tags = sorted(dataset.query_tags)
    if args.limit:
        tags = tags[:args.limit]

    # append mode so repeated runs extend an existing map file
    tag_map_file = open(args.tag_map, 'a') if args.tag_map else None

    counter = 0
    for tag in tags:
        # 'XXX' in the template becomes a 3-digit zero-padded counter
        out_name = args.output_files.replace('XXX', '%03d' % counter)
        # short_name: base name without its extension, used in the tag map
        short_name = os.path.basename(out_name)
        if '.' in short_name:
            short_name = short_name[:short_name.index('.')]

        if tag_map_file:
            tag_map_file.write('{}:{}\n'.format(tag, short_name))
        counter += 1
        contents = dataset.filter(lambda x: x.tag == tag)
        with open(out_name, 'wb') as f:
            # transparently wrap the file in an LZ4 stream when requested;
            # the explicit f.close() closes the wrapper, the with-statement
            # closes the underlying file
            if out_name.endswith('.lz4'):
                f = lz4frame.LZ4FrameFile(f, mode='wb')
            sys.stderr.write('\r[{}/{}] tag {}'.format(counter, len(tags), tag))
            pickle.dump(contents, f)
            f.close()
    sys.stderr.write('\rDone                       \n')
    if tag_map_file:
        tag_map_file.close()
Beispiel #16
0
    def fromJsonObj(jobj, stats):
        """Factory: build the appropriate MalInstruction subclass for a
        parsed JSON trace record.

        @arg jobj : dict                  // one "done"-state trace record
        @arg stats: dict<str,ColumnStats> // column statistics
        @ret MalInstruction (or subclass) modelling this instruction
        """
        size = int(jobj["size"])
        pc = int(jobj["pc"])
        clk = int(jobj["clk"])
        short = jobj["short"]
        fname, _, _ = Utils.extract_fname(jobj["short"])
        tag = int(jobj["tag"])
        # rv = [rv.get("size", 0) for rv in jobj["ret"]]
        ro = jobj.get("ret", [])  # return object
        # only return values still alive (eol == 0) contribute to ret_size
        ret_size = sum([o.get("size", 0) for o in ro if int(o["eol"]) == 0])
        arg_size = sum([o.get("size", 0) for o in jobj.get("arg", [])])
        arg_list = [Arg.fromJsonObj(e) for e in jobj.get("arg", [])]
        ret_args = [Arg.fromJsonObj(e) for e in ro]  # if e["eol"]==0]
        # print(len(alive_ret))
        # bytes freed by arguments whose lifetime ends at this instruction
        free_size = sum([arg.size for arg in arg_list if arg.eol == 1])
        arg_vars = [arg.name for arg in arg_list if arg.isVar()]
        ret_vars = [ret['name'] for ret in ro if Utils.isVar(ret['name'])]
        count = int(jobj["ret"][0].get("count", 0))

        # positional arguments shared by every MalInstruction constructor
        con_args = [
            pc, clk, short, fname, size, ret_size, tag, arg_size, arg_list,
            ret_args, free_size, arg_vars, ret_vars, count
        ]

        # Select Instructions
        if fname in ['select', 'thetaselect', 'likeselect']:
            return SelectInstruction(*con_args, jobj=jobj,
                                     stats=stats)  # TODO replace jobj
        # Projections
        elif fname in ['projectionpath']:
            return DirectIntruction(*con_args, base_arg_i=0)
        elif fname in ['projection', 'projectdelta']:
            return ProjectInstruction(*con_args)
        # Joins
        elif fname in ['join', 'thetajoin', 'crossproduct']:
            return JoinInstruction(*con_args)
        # Group Instructions
        elif fname in ['group', 'subgroup', 'subgroupdone', 'groupdone']:
            # base column: last dotted component of the first argument
            a0 = arg_list[0].col.split('.')[::-1][0]
            return GroupInstruction(*con_args,
                                    base_arg_i=0,
                                    base_col=a0,
                                    col_stats=stats.get(a0, None))
        # Set Instructions: the last parameter determines how to compute the
        #     prediction for this MAL instruction
        elif fname in ['intersect']:
            return SetInstruction(*con_args, i1=0, i2=1, fun=min)
        elif fname in ['mergecand']:
            return SetInstruction(*con_args, i1=0, i2=1, fun=_lambda_add)
        elif fname in ['difference']:
            return SetInstruction(*con_args, i1=0, i2=1, fun=_lambda_lefthand)
        elif fname in ['<', '>', '>=', '<=']:
            if arg_list[1].isVar():
                return SetInstruction(*con_args, i1=0, i2=1, fun=min)
            else:
                return DirectIntruction(*con_args, base_arg_i=0)
        # Direct Intructions
        elif fname in ['+', '-', '*', '/', 'or', 'dbl', 'and', 'lng', '%']:
            if arg_list[0].isVar():
                return DirectIntruction(*con_args, base_arg_i=0)
            elif arg_list[1].isVar():
                return DirectIntruction(*con_args, base_arg_i=1)
            else:
                return ReduceInstruction(*con_args)
        elif fname in ['==', 'isnil', '!=', 'like']:
            return DirectIntruction(*con_args, base_arg_i=0)
        elif fname in ['sort']:
            return DirectIntruction(*con_args, base_arg_i=0, base_ret_i=1)
        elif fname in ['subsum', 'subavg', 'subcount', 'submin']:
            return DirectIntruction(*con_args, base_arg_i=2)
        elif fname in ['subslice']:
            return DirectIntruction(*con_args, base_arg_i=0)
        elif fname in ['firstn']:
            argl = len(arg_list)
            assert argl == 4 or argl == 6
            # NOTE(review): `n` is computed but never used below -- the
            # intended use (e.g. bounding the prediction via fun) appears
            # to have been lost; confirm before removing.
            n = int(arg_list[3].aval) if argl == 6 else int(arg_list[1].aval)
            return DirectIntruction(*con_args, base_arg_i=0, fun=min)
        elif fname in [
                'hash', 'bulk_rotate_xor_hash', 'identity', 'mirror', 'year',
                'ifthenelse', 'delta', 'substring', 'project', 'int', 'floor'
        ]:
            return DirectIntruction(*con_args, base_arg_i=0)
        # NOTE(review): this 'dbl' branch looks unreachable -- 'dbl' is
        # already matched by the arithmetic branch above, which returns
        # first; confirm which base_arg_i is intended.
        elif fname in ['dbl']:
            return DirectIntruction(*con_args, base_arg_i=1)
        elif fname in ['hge']:
            if arg_list[1].cnt > 0:
                return DirectIntruction(*con_args, base_arg_i=1)
            else:
                return ReduceInstruction(*con_args)
        elif fname in ['append']:
            return DirectIntruction(*con_args, base_arg_i=0, fun=_lambda_inc)
        elif fname in ['max', 'min']:
            if len(arg_list) == 1:
                return ReduceInstruction(*con_args)
            else:
                assert len(arg_list) == 2
                return DirectIntruction(*con_args, base_arg_i=0)
        # Aggregate Instructions (result = 1)
        elif fname in ['sum', 'avg', 'single', 'dec_round']:
            return ReduceInstruction(*con_args)
        elif fname in ['new']:
            return NullInstruction(*con_args)
        # Load stuff
        elif fname in ['tid', 'bind', 'bind_idxbat']:
            return LoadInstruction(*con_args)
        else:
            # logging.error("What instruction is this ?? {}".format(fname))
            return MalInstruction(*con_args)
Beispiel #17
0
    def predict(self, traind, approxG, default=None):
        """
        @desc run kNN to find the 5 closest instructions based on the range bounds
        range extrapolation:  (self.hi - self.lo) / (traini.hi - traini.lo)
        arg   extrapolation:  self.arg_cnt / traini.arg_cnt
        prediction(traini) = traini.cnt * range_extrapolation * arg_extrapolation
        @arg traind : MalDictionary // training set
        @arg approxG: approximation graph built from traind
        @arg default: unused; kept for interface compatibility
        @ret list containing a single Prediction for this instruction
        """
        assert approxG is not None
        # First get candidate list by searching for instructions with the same name
        self_list = traind.mal_dict.get(self.fname, [])

        # prev_list = []
        #
        # tmp = self
        # while(tmp.prev_i != None):
        #     prev_list.append(tmp.prev_i)
        #     tmp = tmp.prev_i

        # prev_list.reverse()
        # curr_nn = self_list
        # maxk = 5 * (2 ** len(prev_list))
        # for node in prev_list:
        #     k = int(maxk / 2)
        #     curr_level = [ins for ins in curr_nn if node.col == ins.col]
        #     curr_nn = node.kNN(curr_level,k, approxG)
        #     maxk = maxk / 2
        # if self.proj_col != 'TMP' and self.prev_i != None:
        #     level1 = [ins for ins in self_list if self.proj_col == ins.col]
        #     logging.error("len level1: {}".format(len(level1)))
        #     logging.error("testing {}".format(self.short))
        #     prev_nn = self.prev_i.kNN(level1,100, approxG)
        #     cand_list = list([ins.next_i for ins in prev_nn])
        # else:
        #     cand_list = self_list

        # Second: filter candidate list by columns, projection columns and length of arguments
        cand_list = [
            ins for ins in self_list
            if self.col == ins.col and self.proj_col == ins.proj_col
            and len(ins.arg_list) == len(self.arg_list)
        ]
        # random.shuffle(cand_list)
        nn = self.kNN(cand_list, 5, approxG)
        rt = self.ret_args[0].atype  # return type

        # DEBUG
        if self.fname == 'thetaselect' and self.op == '>':
            for ins in nn:
                logging.debug("NN: {} {}".format(ins.cnt, ins.short))

        # NOTE(review): when nn is empty this only logs -- nn[0] below
        # will then raise IndexError; confirm whether `default` should be
        # returned instead.
        if len(nn) == 0:
            logging.error("0 knn len in select ?? {} {} {} {}".format(
                self.short, self.op, self.ctype, self.col))
            # logging.error("Cand len {}".format(len(prev_nn)))
            logging.error("self col: {} proj col {}".format(
                self.col, self.proj_col))
            # for di in [ins for ins in self_list if self.col == ins.col]:
            # logging.error("cand: {} {} {}".format(di.short, di.col, di.proj_col))

        # still just for debugging: keep only one instruction, iso 5 instructions
        nn.sort(key=lambda ins: self.approxArgDist(ins, approxG))
        nn1 = nn[0]
        arg_cnt = self.approxArgCnt(approxG)

        if arg_cnt is not None:
            # do the proper scale up/down according to the proportion
            # NOTE(review): the sum skips neighbours with argCnt() <= 0
            # but still divides by len(nn) -- confirm this bias is intended.
            avg = sum([
                i.extrapolate(self) * (arg_cnt / i.argCnt())
                for i in nn if i.argCnt() > 0
            ]) / len(nn)
            cal_avg = min(avg, arg_cnt)  # calibration
            # predicted memory = calibrated count * size of the return type
            avgm = cal_avg * Utils.sizeof(rt)
            cnt1 = nn1.extrapolate(self) * arg_cnt / nn1.argCnt(
            ) if nn1.argCnt() > 0 else nn1.extrapolate(self)
            return [
                Prediction(retv=self.ret_vars[0],
                           ins=nn1,
                           cnt=cnt1,
                           avg=cal_avg,
                           t=rt,
                           mem=avgm)
            ]
        else:
            # no approximate argument count available: fall back to plain
            # extrapolation from the nearest neighbours (no mem estimate)
            logging.error("None arguments ??? {}".format(self.lead_arg.name))
            avg = sum([i.extrapolate(self) for i in nn]) / len(nn)
            return [
                Prediction(retv=self.ret_vars[0],
                           ins=nn1,
                           cnt=nn1.extrapolate(self),
                           avg=avg,
                           t=rt)
            ]