# Example 1
 def update_per_subtoken_statistics(self, results, true_positive, false_positive, false_negative):
     """Accumulate subtoken-level precision/recall counts over predictions.

     Args:
         results: iterable of (original_name, top_words) prediction pairs.
         true_positive: running count of predicted subtokens present in the
             original name.
         false_positive: running count of predicted subtokens absent from the
             original name.
         false_negative: running count of original subtokens the prediction
             missed.

     Returns:
         The updated (true_positive, false_positive, false_negative) tuple.
     """
     for original_name, top_words in results:
         # Top-ranked prediction after filtering impossible names.
         prediction = common.filter_impossible_names(top_words)[0]
         original_subtokens = common.get_subtokens(original_name)
         predicted_subtokens = common.get_subtokens(prediction)
         for subtok in predicted_subtokens:
             if subtok in original_subtokens:
                 true_positive += 1
             else:
                 false_positive += 1
         for subtok in original_subtokens:
             # Idiomatic membership test ("not in" instead of "not ... in").
             if subtok not in predicted_subtokens:
                 false_negative += 1
     return true_positive, false_positive, false_negative
# Example 2
 def update_correct_predictions(self, num_correct_predictions, output_file, results):
     """Update the per-rank histogram of correct predictions.

     A prediction correct at rank i + 1 increments every bucket
     num_correct_predictions[i .. self.topk - 1], so bucket j holds the count
     of names predicted correctly within the top j + 1 suggestions.

     Args:
         num_correct_predictions: mutable sequence of length >= self.topk with
             the running per-rank counts.
         output_file: open text file receiving a human-readable log.
         results: iterable of (original_name, top_words) prediction pairs.

     Returns:
         The updated num_correct_predictions (also mutated in place).
     """
     for original_name, top_words in results:
         normalized_original_name = common.normalize_word(original_name)
         predicted_something = False
         for i, predicted_word in enumerate(common.filter_impossible_names(top_words)):
             if i == 0:
                 output_file.write('Original: ' + original_name + ', predicted 1st: ' + predicted_word + '\n')
             predicted_something = True
             normalized_suggestion = common.normalize_word(predicted_word)
             if normalized_original_name == normalized_suggestion:
                 output_file.write('\t\t predicted correctly at rank: ' + str(i + 1) + '\n')
                 for j in range(i, self.topk):
                     num_correct_predictions[j] += 1
                 break
         if not predicted_something:
             # BUG FIX: terminate the log line with '\n' like every other write.
             output_file.write('No results for predicting: ' + original_name + '\n')
     return num_correct_predictions
def update_algorithm_dict(results, algorithm_dict):
    """Tally per-algorithm prediction outcomes keyed by the predicted main subtoken.

    For each prediction, compares the first subtoken of the predicted name
    against the first subtoken of the original name and updates the counters
    on the corresponding Algorithm entry, creating the entry on first sight.

    Args:
        results: iterable of (original_name, top_words) prediction pairs.
        algorithm_dict: dict mapping a main subtoken to an Algorithm record.

    Returns:
        The updated algorithm_dict (also mutated in place).
    """
    for original_name, top_words in results:
        prediction = common.filter_impossible_names(top_words)[0]
        original_main = common.get_subtokens(original_name)[0]
        predicted_main = common.get_subtokens(prediction)[0]
        if predicted_main not in algorithm_dict:
            algorithm_dict[predicted_main] = Algorithm(predicted_main, 0, 0)
        # BUG FIX: the original code counted outcomes only in the `else`
        # branch, so the very first sample for each predicted_main was never
        # tallied. Count every sample, including the one that created the entry.
        if predicted_main == original_main:
            algorithm_dict[predicted_main].true_positive += 1
        else:
            # NOTE(review): a mismatch for predicted_main looks like a false
            # *positive*; the attribute is named false_negative upstream —
            # kept as-is to preserve the Algorithm interface. Confirm intent.
            algorithm_dict[predicted_main].false_negative += 1

    return algorithm_dict
# Example 4
    def predict(self):
        """Interactive adversarial prediction loop.

        Repeatedly lets the user edit ``Input.java``, predicts its method
        name, then runs a gradient-guided BFS adversarial search (targeted or
        non-targeted) over renamings of a user-chosen variable. Loops until
        the user types one of ``self.exit_keywords``.
        """
        input_filename = 'Input.java'

        # Vocabulary slices usable for adversarial renaming, capped at
        # MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL entries.
        word_to_indextop, indextop_to_word = self.model.create_ordered_words_dictionary(
            self.model.get_data_dictionaries_path(self.config.LOAD_PATH),
            self.config.MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL)

        print('Starting interactive prediction with BFS adversarial search... (depth = {}, topk = {})'
              .format(self.max_depth, self.topk))
        while True:
            print(
                'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return

            # Read the file up-front (also fails fast if it is missing); the
            # variable is overwritten below with the extracted representation.
            with open(input_filename, "r") as f:
                original_code = f.read()

            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue

            # The extracted line starts with the variable list; everything
            # after the first space is the context-path encoding of the code.
            var_code_split_index = predict_lines[0].find(" ")
            original_code = predict_lines[0][var_code_split_index + 1:]

            results = self.model.predict([original_code])
            prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
            for method_prediction in prediction_results:
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))

            # generate pca
            self.model.creat_PCA_tokens(predict_lines[0])

            # Search for adversarial examples
            print("select variable to rename OR -- to skip search:")
            var_to_rename = input()
            if var_to_rename == "--":
                continue

            # Ask until a valid attack type is given.
            while True:
                print("select attack type: 'nontargeted' for non-targeted attack")
                print("OR target method name for targeted attack (each word is seperated by |)")
                attack_type = input()

                # untargeted searcher
                if attack_type == "nontargeted":
                    print("Using non-targeted attack")
                    searcher = AdversarialSearcher(self.topk, self.max_depth, word_to_indextop, indextop_to_word,
                                                   predict_lines[0],
                                                   lambda c, v: [(var_to_rename, var_to_rename)])
                    break

                # targeted searcher: target must exist in the model vocabulary
                if attack_type in self.model.target_word_to_index:
                    print("Using targeted attack. target:", attack_type)
                    searcher = AdversarialTargetedSearcher(self.topk, self.max_depth, word_to_indextop,
                                                          indextop_to_word, predict_lines[0], attack_type,
                                                          lambda c, v: [(var_to_rename, var_to_rename)])
                    break

                print(attack_type, "not existed in vocab! try again")

            adversarial_results = []

            while True:
                # Evaluate every not-yet-checked candidate renaming.
                batch_nodes_data = [(n, c) for n, c in searcher.pop_unchecked_adversarial_code()]
                batch_data = [c for _, c in batch_nodes_data]
                results = self.model.predict(batch_data, self.guard_input)
                for (node, _), res in zip(batch_nodes_data, results):
                    one_top_words = common.filter_impossible_names(res[1])
                    if not one_top_words:
                        print("code with state: " + str(node) + " cause empty predictions\n")
                        continue

                    if searcher.is_target_found(one_top_words):
                        adversarial_results.append((one_top_words[0], node))

                if adversarial_results and not self.multiple_results:
                    break

                # Expand the search using gradients w.r.t. the input words.
                batch_data = [searcher.get_adversarial_code()]
                batch_word_to_derive = [searcher.get_word_to_derive()]
                loss, all_grads = self.model.calc_loss_and_gradients_wrt_input(batch_data, batch_word_to_derive,
                                                                               indextop_to_word)
                if not searcher.next((0, "", all_grads[0])):
                    break

            # BUG FIX: report failure based on the adversarial search outcome,
            # not on `results` (the last raw model batch, which is rarely empty).
            if not adversarial_results:
                print("FAILD! no replaces found")
            else:
                print("variable replaces:")
                print("Prediction\tnode")
                for r in adversarial_results:
                    print(r[0], "\t", r[1])
    def predict(self):
        """Batch adversarial evaluation over every Java file in ``test_adversarial/src``.

        For each source file: extract context paths, skip methods without
        variables or whose name the model already mispredicts, then for each
        target name (and each variable) run a gradient-guided BFS search over
        single-variable renamings and print any adversarial replacements found.
        """
        input_filename = 'Input.java'
        # MAX_ATTEMPTS = 50
        # MAX_NODES_TO_OPEN = 10

        src_folder = "test_adversarial/src"
        # input_src = ["contains.java", "count.java", "done.java", "escape.java", "factorial.java", "get.java",
        #              "indexOf.java", "isPrime.java", "postRequest.java", "reverseArray.java", "sort.java"]
        input_src = os.listdir(src_folder)
        # Candidate method names for the targeted attack; "|" separates subtokens.
        targets = [
            "sort", "contains", "get", "index|of", "done", "reverse|array",
            "count", "is|prime", "post|request", "escape", "add", "close",
            "main", "max", "min", "factorial", "load", "foo", "update", "bar",
            "exception", "test", "swap", "predict"
        ]

        # Vocabulary slices usable for adversarial renaming.
        word_to_indextop, indextop_to_word = self.model.create_ordered_words_dictionary(
            self.model.get_data_dictionaries_path(self.config.LOAD_PATH),
            self.config.MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL)

        print(
            'Starting interactive prediction with BFS adversarial search... (depth = {}, topk = {})'
            .format(self.max_depth, self.topk))
        for src in input_src:
            print('SAMPLE: ', src)

            input_filename = src_folder + "/" + src

            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                    input_filename)
            except ValueError as e:
                print(e)
                continue

            # Split the extracted line into its variable list and the
            # context-path encoding of the code.
            var, original_code = common_adversarial.separate_vars_code(
                predict_lines[0])

            # ignore methods without vars
            if not common_adversarial.get_all_vars(var):
                print("NO VARS. skip.")
                continue

            results = self.model.predict([original_code])
            prediction_results = common.parse_results(results,
                                                      hash_to_string_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            # skip method that were predicted wrong
            # NOTE(review): subtokens are joined with no separator, so e.g.
            # target "index|of" compares as "indexof" — confirm original names
            # lower-case to exactly that joined form.
            method_prediction = prediction_results[0]
            if method_prediction.original_name.lower() != "".join(
                    method_prediction.predictions[0]['name']):
                print("WRONG PREDICTION. skip. (true: {}, pred: {})".format(
                    method_prediction.original_name,
                    method_prediction.predictions))
                continue
            for method_prediction in prediction_results:
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' %
                          (name_prob_pair['probability'],
                           name_prob_pair['name']))

            # Search for adversarial examples
            print("ADVERSARIAL results:")

            for target in targets:
                print("TARGET:", target)
                if target != "nontargeted" and target not in self.model.target_word_to_index:
                    print("target not exist. skip.")
                    continue

                # NOTE(review): the lambdas below close over var_to_rename
                # late-bound; safe only because each searcher is consumed
                # within the same loop iteration — confirm.
                for var_to_rename in common_adversarial.get_all_vars(var):
                    # untargeted searcher
                    if target == "nontargeted":
                        searcher = AdversarialSearcher(
                            self.topk, self.max_depth, word_to_indextop,
                            indextop_to_word, predict_lines[0],
                            lambda c, v: [(var_to_rename, var_to_rename)])
                    else:  # targeted searcher
                        searcher = AdversarialTargetedSearcher(
                            self.topk, self.max_depth, word_to_indextop,
                            indextop_to_word, predict_lines[0], target,
                            lambda c, v: [(var_to_rename, var_to_rename)])

                    adversarial_results = []

                    while True:
                        # Evaluate every not-yet-checked candidate renaming.
                        batch_nodes_data = [
                            (n, c) for n, c in
                            searcher.pop_unchecked_adversarial_code()
                        ]
                        batch_data = [c for _, c in batch_nodes_data]
                        results = self.model.predict(batch_data,
                                                     self.guard_input)
                        for (node, _), res in zip(batch_nodes_data, results):
                            one_top_words = res[1]
                            one_top_words = common.filter_impossible_names(
                                one_top_words)
                            if not one_top_words:
                                print("code with state: " + str(node) +
                                      " cause empty predictions\n")
                                continue

                            if searcher.is_target_found(one_top_words):
                                adversarial_results.append(
                                    (one_top_words[0], node))

                        if adversarial_results and not self.multiple_results:
                            break

                        # Expand the search using gradients w.r.t. the input.
                        batch_data = [searcher.get_adversarial_code()]
                        batch_word_to_derive = [searcher.get_word_to_derive()]
                        loss, all_grads = self.model.calc_loss_and_gradients_wrt_input(
                            batch_data, batch_word_to_derive, indextop_to_word)
                        if not searcher.next((0, "", all_grads[0])):
                            break

                    for r in adversarial_results:
                        print(r[0], "\t\t\t", r[1])