Ejemplo n.º 1
0
    def select(self, orig_list, scores, sel_size):
        """Select the ``sel_size`` best-scoring variants, replacing low-score ones.

        Variants are ranked by score, highest first.  A ranked variant whose
        score equals ``LOW_SCORE`` is not kept; its slot is filled by

        * a variant regenerated from a randomly chosen unused trace, while
          ``self.remaining_traces_id`` is non-empty (the trace id is consumed);
        * otherwise a copy of the seed when ``self.generation == 1``;
        * otherwise a uniformly random choice between a copy of the seed, the
          best variant of the previous generation and the best variant of all
          previous generations.

        Side effect: ``self.vid_from_trace`` is reset and then filled with the
        positions, inside the returned list, of the trace-generated variants.

        :param orig_list: current population, parallel to ``scores``.
        :param scores: one score per entry of ``orig_list``.
        :param sel_size: maximum number of variants to return.
        :return: list of kept and replacement variants, best rank first.
        """
        # Indices into ``scores``, highest score first; ties keep their
        # original relative order because the sort is stable.
        sorted_indices = sorted(
            range(len(scores)), key=lambda idx: scores[idx], reverse=True)

        ret = []
        self.vid_from_trace = []

        for i in sorted_indices[:sel_size]:
            if scores[i] != LOW_SCORE:
                self.logger.info("Selected a file with score %.2f" % scores[i])
                ret.append(orig_list[i])
                continue

            if self.remaining_traces_id:
                # Record the slot this variant takes in the returned list, not
                # its index in the old population: the returned list becomes
                # the new population, so only positions in it are meaningful
                # to whoever consults vid_from_trace afterwards.
                self.vid_from_trace.append(len(ret))
                tid_picked = random.choice(self.remaining_traces_id)
                self.remaining_traces_id.remove(tid_picked)
                self.logger.info(
                    "Ignored a variant with low score, generating from existing trace %d"
                    % tid_picked)
                trace = self.traces[tid_picked]
                new_variant = Trace.generate_variant_from_trace(
                    self.seed_root, trace, self.ext_genome)
                ret.append(new_variant)
                continue

            # No traces left.  The first generation has no history to fall
            # back on, so only the seed is available there.
            if self.generation == 1:
                choice = "seed"
            else:
                choice = random.choice(
                    ['seed', 'last_gen_best', 'historic_best'])

            if choice == "seed":
                self.logger.info(
                    "Ignored a variant with low score, replace with original seed."
                )
                ret.append(deepcopy(self.seed_root))
            elif choice == "last_gen_best":
                best_gen, best_vid, best_score = self.get_best_variant(
                    self.generation - 1, self.generation - 1)
                ret.append(self.load_variant(best_gen, best_vid))
                self.logger.info(
                    "Ignored a variant with low score, replace with best variant in last generation[%d, %d]."
                    % (best_gen, best_vid))
            elif choice == "historic_best":
                best_gen, best_vid, best_score = self.get_best_variant(
                    1, self.generation - 1)
                ret.append(self.load_variant(best_gen, best_vid))
                self.logger.info(
                    "Ignored a variant with low score, replace with best variant in historic generation[%d, %d]."
                    % (best_gen, best_vid))

        return ret
Ejemplo n.º 2
0
def main(args):
    """Graft the exploit paths of a malicious PDF onto a copy of a benign PDF.

    ``args`` must provide:

    * ``mal`` -- path of the malicious PDF; the part of its base name before
      the first ``.`` is the key looked up in the exploit specification.
    * ``ben`` -- path of the benign PDF that receives the injected objects.
    * ``exploit_spec`` -- path of a pickle file mapping that key to a list of
      object paths.
    * ``var_dir`` -- output directory; the result is saved there as
      ``<key>_<base name of ben>``.
    """
    mal_sha1 = os.path.basename(args.mal).split('.')[0]
    # load malicious pdf file.
    mal_obj = PdfGenome.load_genome(args.mal, noxref=True)
    # load benign pdf file.
    ben_obj = PdfGenome.load_genome(args.ben, noxref=True)

    newpdf = deepcopy(ben_obj)
    # get exploit path from the malicious pdf file.
    # NOTE: unpickling can execute arbitrary code; only feed this spec files
    # that come from a trusted source.
    with open(args.exploit_spec, 'rb') as spec_file:
        exploit_spec = pickle.load(spec_file)
    epaths = exploit_spec[mal_sha1]

    all_ben_paths = PdfGenome.get_object_paths(ben_obj, set())

    # inject each path from exploit paths
    for path in epaths:
        src_path = None
        # `j` is the number of trailing components dropped from `path` to
        # reach a prefix that already exists in the benign file.  It is reset
        # for every path so that a path too short to enter the loop neither
        # hits an unbound name nor reuses the value left by the previous path.
        j = 1
        for j in range(1, len(path)):
            if path[:-j] in all_ben_paths:
                src_path = path[:-j]
                break
        if src_path is None:
            src_path = ['/Root']
        # The object to copy sits one level below the matched prefix.
        # `path[:-j + 1]` would be `path[:0]` for j == 1, hence the special
        # case that takes the full path.
        if j > 1:
            tgt_path = path[:-j + 1]
        else:
            tgt_path = path
        PdfGenome.insert_under(newpdf, src_path, mal_obj, tgt_path)

    outname = '%s/%s_%s' % (args.var_dir, mal_sha1, os.path.basename(args.ben))
    PdfGenome.save_to_file(newpdf, outname)
Ejemplo n.º 3
0
    def select(self, orig_list, scores, sel_size):
        """Select the ``sel_size`` best-scoring variants, replacing low-score ones.

        Variants are ranked by score, highest first.  A ranked variant whose
        score equals ``LOW_SCORE`` is dropped and its slot is filled by

        * a variant regenerated from a randomly chosen unused trace, while
          ``self.remaining_traces_id`` is non-empty (the trace id is consumed);
        * otherwise a copy of the seed when ``self.generation == 1``;
        * otherwise a uniformly random choice between a copy of the seed, the
          best variant of the previous generation and the best variant of all
          previous generations.

        Side effect: ``self.vid_from_trace`` is reset and then filled with the
        positions, inside the returned list, of the trace-generated variants.

        :param orig_list: current population, parallel to ``scores``.
        :param scores: one score per entry of ``orig_list``.
        :param sel_size: maximum number of variants to return.
        :return: list of kept and replacement variants, best rank first.
        """
        # Indices into ``scores``, highest score first (stable for ties).
        sorted_indices = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)

        ret = []
        self.vid_from_trace = []

        for i in sorted_indices[:sel_size]:
            if scores[i] != LOW_SCORE:
                self.logger.info("Selected a file with score %.2f" % scores[i])
                ret.append(orig_list[i])
                continue

            if self.remaining_traces_id:
                # Store the position in the returned list rather than the
                # index in the old population: the returned list is what the
                # caller uses as the new population, so the old index would
                # point at an unrelated variant.
                self.vid_from_trace.append(len(ret))
                tid_picked = random.choice(self.remaining_traces_id)
                self.remaining_traces_id.remove(tid_picked)
                self.logger.info("Ignored a variant with low score, generating from existing trace %d" % tid_picked)
                trace = self.traces[tid_picked]
                new_variant = Trace.generate_variant_from_trace(self.seed_root, trace, self.ext_genome)
                ret.append(new_variant)
                continue

            # No traces left: generation 1 has no history, so it always falls
            # back to the seed; later generations pick a source at random.
            if self.generation == 1:
                choice = "seed"
            else:
                choice = random.choice(['seed', 'last_gen_best', 'historic_best'])

            if choice == "seed":
                self.logger.info("Ignored a variant with low score, replace with original seed.")
                ret.append(deepcopy(self.seed_root))
            elif choice == "last_gen_best":
                best_gen, best_vid, best_score = self.get_best_variant(self.generation - 1, self.generation - 1)
                ret.append(self.load_variant(best_gen, best_vid))
                self.logger.info("Ignored a variant with low score, replace with best variant in last generation[%d, %d]." % (best_gen, best_vid))
            elif choice == "historic_best":
                best_gen, best_vid, best_score = self.get_best_variant(1, self.generation - 1)
                ret.append(self.load_variant(best_gen, best_vid))
                self.logger.info("Ignored a variant with low score, replace with best variant in historic generation[%d, %d]." % (best_gen, best_vid))

        return ret
Ejemplo n.º 4
0
    def initial_population(self):
        """Build the first population from stored traces plus random mutants.

        The distinct traces among ``self.success_traces`` and
        ``self.promising_traces`` are stored in ``self.traces``.  Up to
        ``pop_size`` of them (all of them, or a random sample when there are
        more) are replayed on the seed to create variants; the ids of the
        traces that were not used stay in ``self.remaining_traces_id``.  The
        population is then topped up to ``pop_size`` with random mutations of
        the seed.

        :return: list of ``pop_size`` variant roots (more only if there is
            nothing to top up and nothing to trim, i.e. never in practice).
        """
        logger = self.logger
        logger.info("Getting initial population from existing mutation traces (success: %d, promising: %d)." \
                    % (len(self.success_traces), len(self.promising_traces)))
        popul = []
        # Normalise once so that every comparison below uses the same value.
        pop_size = int(self.pop_size)

        traces = self.success_traces + self.promising_traces
        traces = Trace.get_distinct_traces(traces)
        logger.info("Got %d distinct traces" % len(traces))
        self.traces = traces

        # A real list is required because ids are removed as they are
        # consumed; a Python 3 range object has no remove().
        self.remaining_traces_id = list(range(len(traces)))

        if len(self.remaining_traces_id) > pop_size:
            tid_picked = sorted(
                random.sample(self.remaining_traces_id, pop_size))
        else:
            # Copy: the loop below removes from remaining_traces_id while
            # iterating over the picked ids.
            tid_picked = list(self.remaining_traces_id)

        # generate_variants_from_traces
        for i in tid_picked:
            self.remaining_traces_id.remove(i)
            logger.debug("Generating %d variant from existing trace." % i)
            trace = traces[i]
            variant_root = Trace.generate_variant_from_trace(
                self.seed_root, trace, self.ext_genome)
            popul.append(variant_root)

        if len(popul) < pop_size:
            logger.info("Getting %d more variants in initial population by random mutation." \
                        % (pop_size - len(popul)))

        while len(popul) < pop_size:
            i = len(popul)
            logger.debug("Getting variant %d in initial population." % i)
            root = deepcopy(self.seed_root)
            root = PdfGenome.mutation(self.ext_trie, root, self.mut_rate,
                                      self.ext_genome)
            popul.append(root)
        return popul
Ejemplo n.º 5
0
    def initial_population(self):
        """Build the first population from stored traces plus random mutants.

        The distinct traces among ``self.success_traces`` and
        ``self.promising_traces`` are stored in ``self.traces``.  Up to
        ``pop_size`` of them (all of them, or a random sample when there are
        more) are replayed on the seed to create variants; the ids of the
        traces that were not used stay in ``self.remaining_traces_id``.  The
        population is then topped up to ``pop_size`` with random mutations of
        the seed.

        :return: list of variant roots forming the initial population.
        """
        logger = self.logger
        logger.info("Getting initial population from existing mutation traces (success: %d, promising: %d)." \
                    % (len(self.success_traces), len(self.promising_traces)))
        popul = []

        traces = self.success_traces + self.promising_traces
        traces = Trace.get_distinct_traces(traces)
        logger.info("Got %d distinct traces" % len(traces))
        self.traces = traces

        # A real list is required because ids are removed as they are
        # consumed; a Python 3 range object has no remove().
        self.remaining_traces_id = list(range(len(traces)))

        if 0 < len(self.remaining_traces_id) <= self.pop_size:
            # Must be a copy.  Iterating over remaining_traces_id itself while
            # removing from it below skips every other id, so only half of
            # the available traces would be replayed.
            tid_picked = list(self.remaining_traces_id)
        elif len(self.remaining_traces_id) > self.pop_size:
            tid_picked = random.sample(self.remaining_traces_id, self.pop_size)
            tid_picked.sort()
        else:
            tid_picked = []

        # generate_variants_from_traces
        for i in tid_picked:
            self.remaining_traces_id.remove(i)
            logger.debug("Generating %d variant from existing trace." % i)
            trace = traces[i]
            variant_root = Trace.generate_variant_from_trace(self.seed_root, trace, self.ext_genome)
            popul.append(variant_root)

        if len(popul) < int(self.pop_size):
            logger.info("Getting %d more variants in initial population by random mutation." \
                        % (int(self.pop_size) - len(popul)))

        while len(popul) < int(self.pop_size):
            i = len(popul)
            logger.debug("Getting variant %d in initial population." % i)
            root = deepcopy(self.seed_root)
            root = PdfGenome.mutation(root, self.mut_rate, self.ext_genome)
            popul.append(root)
        return popul
def get_cf(file_name):
    """Print the conserved features of the seed PDF ``file_name``.

    For every object path of ``samples/seeds/<file_name>`` a variant with
    that single path deleted is written to
    ``samples/tmp_pdfs/<file_name>/<k>.pdf``.  All variants are submitted to
    ``cuckoo`` ``n_test`` times.  A path whose variant never yields a result
    other than ``'[]'`` is reported as a conserved feature, provided its
    feature path is known.  Nothing is returned; results are printed.
    """
    # We evaluate each variant n_test times.
    n_test = 5
    seed_file_path = 'samples/seeds/' + file_name
    pdf_folder = 'samples/tmp_pdfs/' + file_name + '/'
    # Create the folder directly instead of running `mkdir -p` through a
    # shell, so that special characters in file_name are never interpreted
    # as shell syntax.
    if not os.path.isdir(pdf_folder):
        os.makedirs(pdf_folder)
    seed_root = PdfGenome.load_genome(seed_file_path)
    root = deepcopy(seed_root)
    visited_paths = set()
    # NOTE(review): the path list is queried three times with the same
    # arguments.  This is kept because it is not visible here whether
    # get_object_paths has side effects on `root` or `visited_paths`; if it
    # has none, a single call is enough.
    remaining_paths = PdfGenome.get_object_paths(root, visited_paths)
    obj_paths = PdfGenome.get_object_paths(root, visited_paths)
    path_len = len(PdfGenome.get_object_paths(root, visited_paths))
    print('Initial paths:', remaining_paths)
    print(path_len)

    # Auxiliary list of the variant numbers in ASCII (string) order.
    # Presumably list_file_paths returns the files sorted by name, so the
    # k-th file is variant number aux[k] -- verify against list_file_paths.
    aux = sorted(str(k) for k in range(path_len))

    # Sequentially delete structural paths: variant k is the seed without
    # its k-th object path.
    for k in range(path_len):
        root = deepcopy(seed_root)
        PdfGenome.delete(root, remaining_paths[k])
        save_path = pdf_folder + str(k) + '.pdf'
        PdfWriter().write(save_path, root)

    # Evaluate the maliciousness of the variants: n_mal[k] counts the runs in
    # which the k-th file produced a result other than '[]'.
    fpaths = list_file_paths(pdf_folder)
    n_mal = [0] * len(fpaths)
    for _ in range(n_test):
        results = cuckoo(fpaths)
        for j, result in enumerate(results):
            if result != '[]':
                n_mal[j] += 1

    # If the PDF becomes benign after a structural path is deleted, that
    # path is one of its conserved features.
    paths = []
    for i, hits in enumerate(n_mal):
        if hits == 0:
            print(i)
            path = get_path(obj_paths[int(aux[i])])
            # NOTE(review): `feat_list` and `feature_list` are two different
            # names defined outside this function -- confirm both are
            # intended.
            if path in feat_list:
                paths.append(get_feat_seq(path, feature_list))

    paths = sorted(set(paths))
    print('%s %s' % (file_name, paths))
Ejemplo n.º 7
0
def get_cr(ext_path='/home/liangtong/pdf_files/benign/ir01-108.pdf',
           mal_path=('/home/liangtong/EvadeML-master/samples/seeds/'
                     '001d92fc29146e01e0ffa619e5dbf23067f1e814'),
           save_path='/home/liangtong/Desktop/tmp_pdfs/test.pdf',
           del_index=19):
    """Write a synthetic PDF: the malicious PDF with one object path deleted.

    The object paths of the malicious file and of the resulting synthetic
    file are printed with their indices, so that a path can be chosen by
    number.  Called without arguments, the function uses the file locations
    and the path index that were previously hard-coded.

    :param ext_path: benign PDF to load.
    :param mal_path: malicious PDF the synthetic file is derived from.
    :param save_path: where the synthetic PDF is written.
    :param del_index: index, in the printed path list of the malicious PDF,
        of the object path to delete.
    """
    # STEP 1. Load the external benign pdf file.  Its root and paths are not
    # used by the deletion below; they are still loaded as before so that a
    # missing or unreadable benign file is reported.
    ext_root = PdfGenome.load_genome(ext_path)
    ext_obj = PdfGenome.get_object_paths(ext_root, set())

    # STEP 2. Load the malicious pdf file and list its object paths.  The
    # paths are collected from a copy, leaving mal_root untouched for STEP 3.
    mal_root = PdfGenome.load_genome(mal_path)
    tmp_root = deepcopy(mal_root)
    mal_obj = PdfGenome.get_object_paths(tmp_root, set())
    print('Paths in the malicious PDF')
    for i, obj_path in enumerate(mal_obj):
        print('%d %s' % (i, obj_path))

    # STEP 3. Prepare the synthetic PDF by deleting the chosen path.
    syn_root = deepcopy(mal_root)
    print('Target and source paths')
    print(mal_obj[del_index])
    PdfGenome.delete(syn_root, mal_obj[del_index])

    syn_obj = PdfGenome.get_object_paths(syn_root, set())
    print('Paths in the synthetic file')
    for i, obj_path in enumerate(syn_obj):
        print('%d %s' % (i, obj_path))

    # STEP 4. Store the synthetic PDF.
    PdfWriter().write(save_path, syn_root)

    # NOTE: checking the written file in the sandbox (cuckoo) is not done
    # here; that step was disabled.
def generate_pdf(src_entry, sha1, ins_indices, del_indices, model_name):
    """Apply feature insertions and deletions to a copy of ``src_entry``.

    Insertion: for every index in ``ins_indices`` known to the module-level
    ``genome_dict``, the mapped training file is loaded and the object at the
    first of its recorded paths is copied into the new PDF, at the deepest
    prefix of that path that already exists there.

    Deletion: for every index in ``del_indices``, all full object paths of
    ``src_entry`` whose compact form equals ``idx_to_path[index]`` are
    deleted from the new PDF.

    The result is saved as ``unrestricted/<model_name>/<sha1>.pdf``.

    :return: tuple ``(new pdf, path of the saved file)``.
    """
    mutated = deepcopy(src_entry)

    ### INSERTION
    for index in ins_indices:
        # Indices without a known donor file are skipped.
        try:
            train_f, fullpaths = genome_dict[index]
        except KeyError:
            continue
        fname = '../data/traintest_all_500test/train_benign/%s' % train_f
        # Retry with the other xref setting when the first parse fails.
        try:
            donor = PdfGenome.load_genome(fname, noxref = True)
        except pdfrw.errors.PdfParseError:
            donor = PdfGenome.load_genome(fname, noxref = False)

        # Deterministic choice: always the first recorded path, turned into
        # a list of '/'-prefixed components starting with '/Root'.
        components = ('/Root' + fullpaths[0]).split('/')[1:]
        tgt_path = ['/' + name for name in components]

        # Walk down the new PDF along tgt_path; `depth` counts how many
        # leading components (excluding the last one) could be followed.
        node = mutated
        depth = 0
        for key in tgt_path[:-1]:
            try:
                node = node[key]
            except (KeyError, TypeError):
                break
            depth += 1

        src_key = tgt_path[:depth]
        if src_key == ['/Root']:
            # Only the root matched: hang the first child under it.
            PdfGenome.insert_under(mutated, src_key, donor, tgt_path[:depth + 1])
        else:
            # Best effort: an insertion that fails is skipped.
            try:
                PdfGenome.insert(mutated, src_key, donor, tgt_path[:depth])
            except Exception:
                pass

    ### DELETION
    # Map every compact path to the full paths of src_entry that collapse
    # to it, so that all of them can be deleted together.
    compact_to_full = defaultdict(list)
    for fullpath in PdfGenome.get_object_paths(src_entry):
        # Drop the leading component and all integer components.
        fullkey = ''.join([item for item in fullpath[1:] if type(item) is not int])
        # IMPORTANT: make this path compact
        compact_to_full[compact(fullkey[1:])].append(fullpath)

    # TODO: remove debug
    print(compact_to_full)

    for index in del_indices:
        for path in compact_to_full[idx_to_path[index]]:
            # TODO: remove debug
            print('delete: %s' % (path,))
            try:
                PdfGenome.delete(mutated, path)
            except Exception:
                # the parent may already be deleted
                pass

    file_dir = 'unrestricted/%s' % model_name
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)
    pdf_path = '%s/%s.pdf' % (file_dir, sha1)
    PdfGenome.save_to_file(mutated, pdf_path)
    return mutated, pdf_path
Ejemplo n.º 9
0
    def run(self):
        """Run the hill-climbing search over mutation traces.

        Each hill-climbing round:

        1. mutates the seed until ``pop_size`` variants with a falsy fitness
           result have been collected, keeping their traces;
        2. for ``max_gen`` generations, bisects the prefix length of every
           trace on the fitness result (truthy raises the lower bound, falsy
           lowers the upper bound);
        3. rebuilds every variant from the prefix at its upper bound and
           stops successfully as soon as one variant has a truthy sandbox
           result together with a falsy fitness result;
        4. otherwise bisects again, this time on the sandbox result, picks
           the variant with the smallest gap between the two bounds and
           makes a prefix of its trace the new seed for the next round.

        The search gives up once ``hc_count`` exceeds ``max_hc_count``.
        Scores are pickled to ``<job_dir>/fitness_scores.pickle`` after every
        bisection generation.

        :return: always ``True``.
        """
        self.logger.info("Start a gp task with %s" % (self.gp_params))

        score_file_name = os.path.join(self.job_dir, "fitness_scores.pickle")
        self.fitness_scores = {}
        self.hc_count = 0

        max_hc_count = 5

        while True:
            if self.hc_count > max_hc_count:
                self.logger.info("Giving up!")
                break

            self.generation = 0
            self.popul = []
            # Generate traces

            self.traces = []
            self.bin_search_range = []

            self.logger.info("HC step %d" % self.hc_count)

            generation_count = 0
            while len(self.traces) < self.pop_size:
                possible_new_mutation = PdfGenome.mutation(
                    deepcopy(self.seed_root),
                    self.mut_rate,
                    self.ext_genome,
                    max_mut=2**self.max_gen)
                path = self.save_trace_generation(possible_new_mutation,
                                                  "%d.pdf" % generation_count)
                score = self.fitness([path])[0]
                # Only mutants with a falsy fitness result are kept.
                if not score:
                    self.popul.append(possible_new_mutation)
                    self.traces.append(possible_new_mutation.active_trace)
                    # [lower, upper] bounds of the trace prefix length.
                    self.bin_search_range.append(
                        [0, len(possible_new_mutation.active_trace)])
                    test_pdf = Trace.generate_variant_from_trace(
                        deepcopy(self.seed_root),
                        possible_new_mutation.active_trace, self.ext_genome)
                    self.save_trace_generation(
                        test_pdf, "%d_test.pdf" % generation_count)
                generation_count += 1
            self.save_variants_to_files()

            # Bisection on the fitness result.
            while self.generation < self.max_gen:
                self.generation += 1
                # Generate samples
                ends = []
                for i in range(self.pop_size):
                    # Floor division: the midpoint is used as a slice bound
                    # and therefore has to be an integer.
                    end = (self.bin_search_range[i][0] +
                           self.bin_search_range[i][1]) // 2
                    self.popul[i] = Trace.generate_variant_from_trace(
                        deepcopy(self.seed_root), self.traces[i][:end],
                        self.ext_genome)
                    ends.append(end)

                file_paths = self.save_variants_to_files()
                scores = self.fitness(file_paths)
                self.fitness_scores[self.generation] = scores
                with open(score_file_name, 'wb') as score_file:
                    pickle.dump(self.fitness_scores, score_file)
                self.logger.info("Fitness scores at generation %d: %s" %
                                 (self.generation, scores))

                for i in range(self.pop_size):
                    if scores[i]:
                        self.bin_search_range[i][0] = ends[i]
                    else:
                        self.bin_search_range[i][1] = ends[i]

            # Check the variants built from the upper bounds.
            self.generation += 1
            for i in range(self.pop_size):
                end = self.bin_search_range[i][1]
                self.popul[i] = Trace.generate_variant_from_trace(
                    deepcopy(self.seed_root), self.traces[i][:end],
                    self.ext_genome)
            file_paths = self.save_variants_to_files()
            scores = self.fitness(file_paths)
            sandbox_result = self.sandbox_func(file_paths)
            flag = False
            for i in range(self.pop_size):
                if sandbox_result[i] and not scores[i]:
                    self.logger.info("Variant %d at gen %d passed!" %
                                     (i, self.generation))
                    flag = True
                    break
            if flag:
                break

            # nothing passed, bin search on sandbox
            classifier_ends = [
                self.bin_search_range[i][1] for i in range(self.pop_size)
            ]

            self.bin_search_range = [[0, len(self.traces[i])]
                                     for i in range(self.pop_size)]
            while self.generation < 2 * self.max_gen:
                self.generation += 1
                # Generate samples
                ends = []
                for i in range(self.pop_size):
                    end = (self.bin_search_range[i][0] +
                           self.bin_search_range[i][1]) // 2
                    self.popul[i] = Trace.generate_variant_from_trace(
                        deepcopy(self.seed_root), self.traces[i][:end],
                        self.ext_genome)
                    ends.append(end)

                file_paths = self.save_variants_to_files()
                scores = self.sandbox_func(file_paths)
                self.fitness_scores[self.generation] = scores
                with open(score_file_name, 'wb') as score_file:
                    pickle.dump(self.fitness_scores, score_file)
                self.logger.info("Sandbox at generation %d: %s" %
                                 (self.generation, scores))

                for i in range(self.pop_size):
                    if scores[i]:
                        self.bin_search_range[i][0] = ends[i]
                    else:
                        self.bin_search_range[i][1] = ends[i]

            sandbox_ends = [
                self.bin_search_range[i][0] for i in range(self.pop_size)
            ]

            # Pick the variant whose fitness bound and sandbox bound are
            # closest; the first one wins on ties.
            min_i = -1
            min_distance = 99999999
            for i in range(self.pop_size):
                this_distance = classifier_ends[i] - sandbox_ends[i]
                self.logger.info("Difference of %d: %d" % (i, this_distance))
                if this_distance < min_distance:
                    min_distance = this_distance
                    min_i = i
                if this_distance < 0:
                    self.logger.critical("========== Take a look at %d!" % i)
            self.logger.info("Picked variant %d at generation %d" %
                             (min_i, self.generation))

            self.hc_count += 1
            # NOTE(review): `hc_step` is not defined in this method;
            # presumably a module-level constant -- confirm it exists.
            new_root = Trace.generate_variant_from_trace(
                deepcopy(self.seed_root),
                self.traces[min_i][:int(sandbox_ends[min_i] * hc_step)],
                self.ext_genome)
            new_root_path = self.save_trace_generation(new_root, 'root.pdf')
            # Reload from disk so that the next round starts from the saved file.
            self.seed_root = PdfGenome.load_genome(new_root_path)
        return True
Ejemplo n.º 10
0
    def run(self):
        """Run the genetic-programming loop for at most ``max_gen`` generations.

        Every generation the population is saved, scored and the scores are
        pickled to ``<job_dir>/fitness_scores.pickle``.  The loop ends when

        * some score exceeds ``fitness_threshold``: the traces of all such
          variants are added to the success traces and dumped, and the
          finished flag file is created in the job directory; or
        * the last generation is reached: if the best score is at least
          ``seed_fitness``, the traces of the best four variants seen so far
          are added to the promising traces and dumped.

        Otherwise the next generation is produced by selection (with
        crossover when ``xover_rate > 0``) followed by mutation of every
        variant whose position is not listed in ``self.vid_from_trace``.
        """
        self.logger.info("Start a gp task with %s" % (self.gp_params))

        score_file_name = os.path.join(self.job_dir, "fitness_scores.pickle")
        self.fitness_scores = {}

        self.popul = self.initial_population()
        self.generation = 1

        while self.generation <= self.max_gen:
            self.logger.info(
                "There're %d variants in population at generation %d." %
                (len(self.popul), self.generation))

            file_paths = self.save_variants_to_files()

            scores = self.fitness(file_paths, self.seed_sha1)
            # Introduce a fake score for testing tracing.
            # scores = [0.1, 0.2] * (self.pop_size/2)

            self.fitness_scores[self.generation] = scores
            with open(score_file_name, 'wb') as score_file:
                pickle.dump(self.fitness_scores, score_file)

            self.logger.info("Fitness scores: %s" % scores)
            self.logger.info("Sorted fitness: %s" %
                             sorted(scores, reverse=True))

            if max(scores) > self.fitness_threshold:
                self.best_score = max(scores)
                self.logger.info(
                    "Already got a high score [%.2f]>%.2f variant, break the GP process."
                    % (max(scores), self.fitness_threshold))

                # Store the success traces.
                for i, score in enumerate(scores):
                    if score > self.fitness_threshold:
                        success_trace = self.popul[i].active_trace
                        self.success_traces.append(success_trace)

                # Dump the new generated traces.
                # We assume no concurrent GP tasks depending on the traces.
                Trace.dump_traces(self.success_traces,
                                  self.success_traces_path)
                # NOTE(review): `finished_flag` is not defined in this
                # method; presumably a module-level constant.
                touch(os.path.join(self.job_dir, finished_flag))
                break
            elif self.generation == self.max_gen:
                # Uses the instance attribute, like the loop condition, rather
                # than relying on a module-level `max_gen` happening to exist.
                self.logger.info("Failed at max generation.")
                if max(scores) >= self.seed_fitness:
                    # k can be a parameter
                    best_k_gen, best_k_vid, best_k_scores = self.get_best_k_variant(
                        4, 1, self.generation)
                    self.best_score = best_k_scores[0]
                    for i in range(len(best_k_scores)):
                        best_gen = best_k_gen[i]
                        best_vid = best_k_vid[i]
                        this_score = best_k_scores[i]
                        promising_trace = self.load_variant_trace(
                            best_gen, best_vid)
                        self.logger.info(
                            "Save the promising trace %.2f of %d:%d" %
                            (this_score, best_gen, best_vid))
                        if promising_trace not in self.promising_traces:
                            self.promising_traces.append(promising_trace)
                    Trace.dump_traces(self.promising_traces,
                                      self.promising_traces_path,
                                      exclude_traces=self.success_traces)
                break

            # Crossover
            if self.xover_rate > 0:
                # Keep half of the population; crossover refills the rest.
                # Floor division: the value is used as a slice bound.
                self.popul = self.select(self.popul, scores,
                                         self.pop_size // 2)
                self.logger.debug(
                    "After selecting goods and replacing bads, we have %d variants in population."
                    % len(self.popul))

                # zip() builds its pairs from slice copies, so appending the
                # children to self.popul does not affect the iteration.
                for p1, p2 in zip(self.popul[0::2], self.popul[1::2]):
                    c1, c2 = PdfGenome.crossover(p1, p2)
                    self.popul.append(c1)
                    self.popul.append(c2)
                self.logger.debug(
                    "After crossover, we have %d variants in population." %
                    len(self.popul))
            else:  # No Crossover
                self.popul = self.select(self.popul, scores, self.pop_size)
                self.logger.debug(
                    "After selecting goods and replacing bads, we have %d variants in population."
                    % len(self.popul))

            # Mutation
            for i in range(len(self.popul)):
                if i not in self.vid_from_trace:
                    self.logger.debug("Generating %d:%d variant" %
                                      (self.generation + 1, i))
                    try:
                        self.popul[i] = PdfGenome.mutation(
                            self.popul[i], self.mut_rate, self.ext_genome)
                    except Exception as e:
                        # Best effort: a variant that cannot be mutated is
                        # replaced by a fresh copy of the seed.
                        self.logger.debug(
                            "Exception %s, replace with original seed" % e)
                        self.popul[i] = deepcopy(self.seed_root)
                else:
                    self.logger.debug("Keep %d:%d variant from trace." %
                                      (self.generation + 1, i))

            self.generation = self.generation + 1
Ejemplo n.º 11
0
    def select(self, orig_list, scores, sel_size):
        """Keep the best ``sel_size`` variants and refill low-score slots.

        The ``sel_size`` highest-scoring variants are examined.  Those whose
        score equals ``LOW_SCORE`` are dropped and counted; the others are
        kept in rank order.  After generation 1 the dropped slots are
        refilled from history: for three or more slots, a third with the
        historic best variant, a third (plus the remainder) with the best-k
        variants and a third with random variants of recent generations.
        Whatever is still missing, and every slot in generation 1, is filled
        with copies of the seed.

        Side effect: ``self.vid_from_trace`` is reset to an empty list.

        :param orig_list: current population, parallel to ``scores``.
        :param scores: one score per entry of ``orig_list``.
        :param sel_size: number of top-ranked variants to examine.
        :return: kept variants followed by the replacements.
        """
        ranking = sorted(
            enumerate(scores), key=lambda pair: pair[1], reverse=True)
        sorted_indices = [position for position, _ in ranking]

        ret = []
        self.vid_from_trace = []

        replace_size = 0
        for idx in sorted_indices[:sel_size]:
            if scores[idx] != LOW_SCORE:
                self.logger.info(
                    "Selected a file with score %.2f" % scores[idx])
                ret.append(orig_list[idx])
            else:
                replace_size += 1

        def refill(gens, vids, log_template):
            """Append one replacement per (generation, id) pair; return the count."""
            for pos, this_gen in enumerate(gens):
                this_vid = vids[pos]
                # Generation 0 denotes the seed itself.
                if this_gen == 0:
                    new_root = deepcopy(self.seed_root)
                else:
                    new_root = self.load_variant(this_gen, this_vid)
                ret.append(new_root)
                self.logger.info(log_template % (this_gen, this_vid))
            return len(gens)

        # replace i to sel_size by selecting from historic best, previous generations randomly (?) or distinct scores (?), and then the seed.
        self.logger.info("Need to find %d replacements" % replace_size)
        remain_size = replace_size
        if self.generation != 1:
            # Split the slots over (historic best, best-k, random recent).
            if replace_size <= 2:
                small_splits = ((0, 0, 0), (1, 0, 0), (1, 1, 0))
                size_best, size_topk, size_rand = small_splits[replace_size]
            else:
                third, leftover = divmod(replace_size, 3)
                size_best, size_topk, size_rand = third, third + leftover, third

            # 1/3 goes to the historic best.
            for _ in range(size_best):
                best_gen, best_vid, best_score = self.get_best_variant(
                    1, self.generation - 1)
                ret.append(self.load_variant(best_gen, best_vid))
                self.logger.info(
                    "Ignored a variant with low score, replace with best variant in historic generation[%d, %d]."
                    % (best_gen, best_vid))
            remain_size -= size_best

            # get best k
            if size_topk != 0:
                k_gen, k_vid, k_scores = self.get_best_k_variant(
                    size_topk, 1, self.generation - 1)
                remain_size -= refill(
                    k_gen, k_vid,
                    "Ignored a variant with low score, replace with one of the good variants in historic generation[%d, %d]."
                )

            # Random picks come from at most the five preceding generations.
            start_gen = max(1, self.generation - 5)
            # sample k that are not LOW_SCORE
            if size_rand != 0:
                k_gen, k_vid, k_scores = self.get_random_k_variant(
                    size_rand, start_gen, self.generation - 1)
                remain_size -= refill(
                    k_gen, k_vid,
                    "Ignored a variant with low score, replace with one of the random variant in the last four generation[%d, %d]."
                )

        # Slots that history could not fill fall back to the seed.
        for _ in range(remain_size):
            self.logger.info(
                "Ignored a variant with low score, replace with original seed."
            )
            ret.append(deepcopy(self.seed_root))
        return ret