def cmp_algorithm_cv(base_dir, normal_dir, data_path, output_dir, model_name='', dataset=''):
    """Run cross-validation training for one model/dataset combination.

    Builds (or reloads from cached pickles in *output_dir*) the feature
    matrix, applies top-500 feature selection, and hands the selected
    matrix to ``CtuCCAnalyzer.train_and_save``.

    :param base_dir:   root directory of per-dataset classifier output
    :param normal_dir: directory holding benign traffic (its 'March'
                       subfolder is used as the normal sample source)
    :param data_path:  path to the malicious/positive sample data
    :param output_dir: directory for cached matrices and vectorizers
    :param model_name: feature-model prefix, e.g. 'tf-ngram_'; its
                       substrings toggle tf-idf weighting and n-grams
    :param dataset:    dataset name appended to base_dir for results
    """
    char_wb = False
    # Feature configuration is encoded in the model name.
    tf = 'tf' in model_name
    ngram = (2, 15) if 'ngram' in model_name else None

    # NOTE(review): no path separator between base_dir and dataset —
    # base_dir is presumably expected to end with '/'; confirm callers.
    classifier_dir = base_dir + dataset
    outfile = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
    if os.path.exists(outfile):
        # Results already computed for this combination; skip the run.
        return

    if os.path.exists(os.path.join(output_dir, model_name + "vec_sel.pkl")):
        # Cached, feature-selected matrices exist — reload them.
        X = Learner.obj_from_file(os.path.join(output_dir, model_name + "X_sel.pkl"))
        y = Learner.obj_from_file(os.path.join(output_dir, model_name + "y_sel.pkl"))
    else:
        # Build the feature matrix from scratch and cache every stage.
        instances, y = Learner.gen_instances(os.path.join(normal_dir, 'March'),
                                             data_path, char_wb=char_wb,
                                             simulate=False)
        X, feature_names, vec = Learner.gen_X_matrix(instances, tf=tf,
                                                     ngrams_range=ngram)
        Learner.save2file(X, os.path.join(output_dir, model_name + "X.pkl"))
        Learner.save2file(y, os.path.join(output_dir, model_name + "y.pkl"))
        Learner.save2file(vec, os.path.join(output_dir, model_name + "vec.pkl"))
        Learner.save2file(feature_names,
                          os.path.join(output_dir, model_name + "feature_names.pkl"))
        # Keep only the 500 most informative features, then cache those too.
        X, feature_names, vec = Learner.feature_selection(X, y, 500, vec,
                                                          instances, tf=tf,
                                                          ngram_range=ngram)
        Learner.save2file(X, os.path.join(output_dir, model_name + "X_sel.pkl"))
        Learner.save2file(y, os.path.join(output_dir, model_name + "y_sel.pkl"))
        Learner.save2file(vec, os.path.join(output_dir, model_name + "vec_sel.pkl"))
        Learner.save2file(feature_names,
                          os.path.join(output_dir, model_name + "feature_names_sel.pkl"))

    CtuCCAnalyzer.train_and_save(X, y, model_name, classifier_dir)
def feature_tab(base_dir):
    """Print a LaTeX table row per model/dataset with feature statistics.

    For each feature model and dataset, loads the cached full feature
    matrix (to report its width) and the selected feature names (to show
    a truncated sample), then prints one LaTeX-formatted table line.

    :param base_dir: root directory containing one subdirectory per
                     dataset with the cached ``*X.pkl`` /
                     ``*feature_names_sel.pkl`` files
    """
    # Human-readable label for each feature-model identifier.
    display_names = {
        'bag': 'Bag-of-word',
        'tf': 'Tf-idf',
        'bag-ngram': 'Bag-of-word-NGram',
        'tf-ngram': 'Tf-idf-NGram',
    }
    for model_name in ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
        model_n = display_names[model_name]
        model_name = model_name + '_'
        for dataset in ['Neris', 'Murlo', 'Virut', 'Sogou']:
            if dataset != 'Neris':
                # Only the first row of each model group carries its label.
                model_n = ''
            line = model_n + ' & ' + dataset + '& '
            # NOTE(review): no separator between base_dir and dataset —
            # base_dir presumably ends with '/'; matches cmp_algorithm_cv.
            output_dir = base_dir + dataset
            X = Learner.obj_from_file(os.path.join(output_dir, model_name + "X.pkl"))
            # Total attribute count before selection; 500 kept after.
            line += str(X.shape[1]) + ' & 500 & '
            feature_names = Learner.obj_from_file(
                os.path.join(output_dir, model_name + "feature_names_sel.pkl"))
            # Show features 1..4 as a sample, truncated to 8 characters.
            for i in range(1, 5):
                feature_name = feature_names[i]
                if len(feature_name) > 8:
                    feature_name = str(feature_name)[:8]
                line += feature_name + ', '
            line += ' ...\\\\ '
            # print(...) form is valid in both Python 2 and 3; the
            # original `print line` statement breaks under Python 3.
            print(line)
def zero_day_helper(base_dir, src_name, model_name, algorithm, target_name, normal_dir=None):
    """Evaluate a classifier trained on one dataset against another.

    Loads the model and vectorizers saved under ``base_dir/src_name``,
    vectorizes samples drawn from *target_name* (malicious samples when
    *normal_dir* is None, benign ones otherwise), and returns the
    prediction results from ``Learner.predict``.

    :param base_dir:    root directory of per-dataset artifacts
    :param src_name:    dataset the classifier was trained on
    :param model_name:  feature-model prefix used in artifact filenames
    :param algorithm:   classifier-algorithm suffix in the model filename
    :param target_name: dataset (or benign subfolder) to test against
    :param normal_dir:  when given, evaluate benign traffic from
                        ``normal_dir/target_name`` instead of malicious
                        samples from ``base_dir/target_name``
    :return: whatever ``Learner.predict`` returns for the target data
    """
    vec_dir = os.path.join(base_dir, src_name)
    model_path = os.path.join(vec_dir, model_name + algorithm + '_sel.pkl')
    target_path = os.path.join(base_dir, target_name)

    # Pick the sample source: malicious target data by default, benign
    # traffic when a normal-traffic directory is supplied.
    if normal_dir is None:
        source_args = ('', target_path)
    else:
        source_args = (os.path.join(normal_dir, target_name), '')
    data, labels = Learner.gen_instances(*source_args)

    # Reload both the full vectorizer (to build the matrix) and the
    # feature-selected one (passed through to prediction).
    full_vec = Learner.obj_from_file(os.path.join(vec_dir, model_name + 'vec.pkl'))
    vec_sel = Learner.obj_from_file(os.path.join(vec_dir, model_name + 'vec_sel.pkl'))
    data, vocab, full_vec = Learner.gen_X_matrix(data, vec=full_vec)

    classifier = Learner.obj_from_file(model_path)
    return Learner.predict(classifier, vec_sel, data, labels=labels,
                           src_name=src_name, model_name=model_name)