def generate_cfg_pair(pair_list):
    """Build serialized CFG adjacency matrices for each function pair.

    Args:
        pair_list: iterable of 2-item sequences; each element names a
            function whose CFG adjacency list is stored at
            ``<config.FEA_DIR>/<name>_cfg.txt``.

    Returns:
        (cfgs_1, cfgs_2): two lists of ``bytes``. Entry i holds the dense
        adjacency matrix of pair_list[i][0] / pair_list[i][1], flattened
        row-major and joined with b',' (e.g. b'0.0,1.0,...').
    """

    def _serialize_cfg(func_name):
        # Read the saved adjacency list, densify it to a float matrix, and
        # flatten to a single b','-joined byte string (TFRecord-friendly).
        graph_cfg = nx.read_adjlist(
            os.path.join(config.FEA_DIR, func_name + '_cfg.txt'))
        adj_arr = np.array(nx.convert_matrix.to_numpy_matrix(graph_cfg, dtype=float))
        adj_str = adj_arr.astype(np.string_)
        return b','.join(list(itertools.chain.from_iterable(adj_str)))

    cfgs_1 = []
    cfgs_2 = []
    logging.info('generate cfg pair...')
    for pair in tqdm(pair_list):
        # Both sides of a pair go through the identical serialization path;
        # the original duplicated this logic inline for pair[0] and pair[1].
        cfgs_1.append(_serialize_cfg(pair[0]))
        cfgs_2.append(_serialize_cfg(pair[1]))
    return cfgs_1, cfgs_2
def gen_tfrecord_and_save_i2v_funcsim(save_file, pair_list, label_list):
    """Serialize the i2v_funcsim dataset to a TFRecord file.

    Builds one tf.train.Example per (pair, label): the label, both
    serialized CFGs and DFGs, both feature matrices, both node counts,
    and the shared padded graph size.

    Args:
        save_file: path of the TFRecord file to write.
        pair_list: function-name pairs, fed to
            construct_learning_dataset_i2v_funcsim.
        label_list: one integer label per pair.
    """
    cfgs_1, cfgs_2, dfgs_1, dfgs_2, feas_1, feas_2, nums_1, nums_2, max_size = \
        construct_learning_dataset_i2v_funcsim(pair_list)
    # Every example carries the same padded graph size. np.full is the
    # direct way to build a constant vector; the original abused
    # np.linspace(max_size, max_size, n) which yields the same values.
    node_list = np.full(len(label_list), max_size, dtype=int)
    logging.info('generate tfrecord and save to {}'.format(save_file))
    # Context manager guarantees the writer is closed (and the file
    # flushed) even if serialization raises mid-loop.
    with tf.python_io.TFRecordWriter(save_file) as writer:
        for label, cfg_1, cfg_2, dfg_1, dfg_2, fea_1, fea_2, num_1, num_2, max_node \
                in tqdm(zip(label_list, cfgs_1, cfgs_2, dfgs_1, dfgs_2,
                            feas_1, feas_2, nums_1, nums_2, node_list)):
            feature = {
                'label': tf.train.Feature(int64_list = tf.train.Int64List(value=[label])),
                'cfg_1': tf.train.Feature(bytes_list = tf.train.BytesList(value=[cfg_1])),
                'cfg_2': tf.train.Feature(bytes_list = tf.train.BytesList(value=[cfg_2])),
                'dfg_1': tf.train.Feature(bytes_list = tf.train.BytesList(value=[dfg_1])),
                'dfg_2': tf.train.Feature(bytes_list = tf.train.BytesList(value=[dfg_2])),
                'fea_1': tf.train.Feature(bytes_list = tf.train.BytesList(value=[fea_1])),
                'fea_2': tf.train.Feature(bytes_list = tf.train.BytesList(value=[fea_2])),
                'num_1': tf.train.Feature(int64_list = tf.train.Int64List(value=[num_1])),
                'num_2': tf.train.Feature(int64_list = tf.train.Int64List(value=[num_2])),
                'max': tf.train.Feature(int64_list = tf.train.Int64List(value=[max_node])),
            }
            features = tf.train.Features(feature = feature)
            example_proto = tf.train.Example(features = features)
            writer.write(example_proto.SerializeToString())
def generate_feature_pair(pair_list, flag):
    """ Collect per-function feature matrices for each pair.

    Args:
        flag: 0: gemini. 1: vulseeker. 2: i2v_***.
            Selects the feature-column window [left, right) read by
            __generate_feature_func.

    Returns:
        feas_1, feas_2: feature matrices for the left/right functions.
        max node count over every function seen.
        np.array of node counts per left function.
        np.array of node counts per right function.
    """
    # Column window per feature flavor; flag == 2 depends on the
    # configured word2vec embedding width.
    if flag == 1:
        left, right = 8, 16
    elif flag == 2:
        left, right = 1, config.WORD2VEC_EMBEDDING_SIZE + 1
    else:
        left, right = 1, 8
    feas_1, feas_2 = [], []
    nums_1, nums_2 = [], []
    nodes_length = []
    logging.info('generate feature pair...')
    for pair in tqdm(pair_list):
        count_left, fea_left = __generate_feature_func(pair[0], left, right, flag)
        feas_1.append(fea_left)
        nums_1.append(count_left)
        nodes_length.append(count_left)
        count_right, fea_right = __generate_feature_func(pair[1], left, right, flag)
        feas_2.append(fea_right)
        nums_2.append(count_right)
        nodes_length.append(count_right)
    return feas_1, feas_2, np.max(nodes_length), np.array(nums_1), np.array(nums_2)
def generate_cfg_dfg_pair(pair_list):
    """Collect serialized CFGs and DFGs for both sides of each pair.

    Returns:
        cfgs_1, cfgs_2, dfgs_1, dfgs_2: parallel lists; entry i comes from
        __generate_cfg_dfg_func applied to pair_list[i][0] / [i][1].
    """
    cfgs_1, dfgs_1 = [], []
    cfgs_2, dfgs_2 = [], []
    logging.info('generate cfg & dfg pair...')
    for pair in tqdm(pair_list):
        # Run the same extractor on each side, routing results to the
        # matching accumulator lists.
        for func_name, cfg_acc, dfg_acc in (
                (pair[0], cfgs_1, dfgs_1),
                (pair[1], cfgs_2, dfgs_2)):
            cfg, dfg = __generate_cfg_dfg_func(func_name)
            cfg_acc.append(cfg)
            dfg_acc.append(dfg)
    return cfgs_1, cfgs_2, dfgs_1, dfgs_2
# NOTE(review): this chunk contains two distinct parts. The first part —
# the `feature` dict through `writer.close()` — duplicates the body of
# gen_tfrecord_and_save_i2v_funcsim and reads like the tail of a writer
# loop whose `for`/`def` header is outside this view (item1..item10 and
# `writer` are defined there — confirm against the enclosing function).
# The second part is the script's top-level driver.
feature = {
    # One tf.train.Example per pair: label, serialized graphs, feature
    # matrices, node counts, and the shared padded graph size.
    'label': tf.train.Feature(int64_list = tf.train.Int64List(value=[item1])),
    'cfg_1': tf.train.Feature(bytes_list = tf.train.BytesList(value=[item2])),
    'cfg_2': tf.train.Feature(bytes_list = tf.train.BytesList(value=[item3])),
    'dfg_1': tf.train.Feature(bytes_list = tf.train.BytesList(value=[item4])),
    'dfg_2': tf.train.Feature(bytes_list = tf.train.BytesList(value=[item5])),
    'fea_1': tf.train.Feature(bytes_list = tf.train.BytesList(value=[item6])),
    'fea_2': tf.train.Feature(bytes_list = tf.train.BytesList(value=[item7])),
    'num_1': tf.train.Feature(int64_list = tf.train.Int64List(value=[item8])),
    'num_2': tf.train.Feature(int64_list = tf.train.Int64List(value=[item9])),
    'max': tf.train.Feature(int64_list = tf.train.Int64List(value=[item10])),
}
features = tf.train.Features(feature = feature)
example_proto = tf.train.Example(features = features)
serialized = example_proto.SerializeToString()
writer.write(serialized)
writer.close()
# Driver: ensure output directories exist, then build the train/valid/test
# TFRecord files from the loaded dataset split.
test_and_create_dirs(config.TFRECORD_TRAIN, \
                     config.TFRECORD_VALID, \
                     config.TFRECORD_TEST)
train_pair, train_label, valid_pair, valid_label, test_pair, test_label = load_dataset()
logging.info('generate tfrecord: i2v_funcsim train...')
gen_tfrecord_and_save_i2v_funcsim(config.TFRECORD_TRAIN, train_pair, train_label)
logging.info('generate tfrecord: i2v_funcsim valid...')
gen_tfrecord_and_save_i2v_funcsim(config.TFRECORD_VALID, valid_pair, valid_label)
logging.info('generate tfrecord: i2v_funcsim test...')
gen_tfrecord_and_save_i2v_funcsim(config.TFRECORD_TEST, test_pair, test_label)
def calculate_auc(labels, predicts):
    """Compute the ROC AUC of `predicts` against binary `labels`.

    Treats label 1 as the positive class, logs the score, and returns it.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(
        labels, predicts, pos_label=1)
    area = auc(false_pos_rate, true_pos_rate)
    logging.info("auc : {}".format(area))
    return area
# NOTE(review): interior fragment of a training loop — the surrounding
# `for i ...` / epoch loop and the definitions of sess, predict, y,
# loss_value, display_step, snapshot, etc. are outside this view. The
# indentation below is a reconstruction; verify against the full file.
tr_acc = compute_accuracy(predict, y)
# Per-batch statistics for later plotting/analysis.
statis_train_batch_loss.append(loss_value)
statis_train_batch_acc.append(tr_acc)
# Running sums: avg_* over the current display window, epoch_avg_* over
# the whole epoch.
avg_loss += loss_value
avg_acc += tr_acc
epoch_avg_loss += loss_value
epoch_avg_acc += tr_acc
# Progress report every `display_step` iterations (and once at i == 0).
if (i + 1) % display_step == 0 or i == 0:
    if i > 0:
        # Convert window sums into averages; at i == 0 the raw single-batch
        # values are reported as-is.
        avg_loss /= display_step
        avg_acc /= display_step
    msg = 'iter: {}, avg_loss: {}, avg_acc: {}'
    logging.info(msg.format(i + 1, avg_loss, avg_acc))
    # Reset the display-window accumulators.
    avg_loss = 0
    avg_acc = 0
# Epoch-level averages over all `num_iter` training batches.
statis_train_epoch_loss.append(epoch_avg_loss / num_iter)
statis_train_epoch_acc.append(epoch_avg_acc / num_iter)
# Validation pass every `snapshot` epochs.
if epoch % snapshot == 0:
    total_labels = []
    total_predicts = []
    avg_loss = 0.
    avg_acc = 0.
    valid_start_time = time.time()
    for i in tqdm(range(int(valid_num / B))):
        # Pull one validation batch from the input pipeline and unpack it
        # into the feed tensors.
        true_batch_valid = sess.run(list(batch_valid))
        y, cfgs_1, dfgs_1, cfgs_2, dfgs_2, feature_1, feature_2, v_num_1, v_num_2 \
            = get_batch(*true_batch_valid)