def cmp_algorithm_cv(base_dir, normal_dir, data_path, output_dir, model_name='', dataset=''):
    """Prepare the selected-feature matrix for ``model_name`` and train on it.

    The vectorizer options are derived from substrings of ``model_name``:
    ``'tf'`` switches the ``tf`` flag on and ``'ngram'`` selects the
    ``(2, 15)`` n-gram range.

    Args:
        base_dir: Prefix that is string-concatenated with ``dataset`` to form
            the classifier directory (no separator is inserted).
        normal_dir: Directory whose ``March`` sub-directory is passed to
            ``Learner.gen_instances`` as its first argument.
        data_path: Second argument forwarded to ``Learner.gen_instances``.
        output_dir: Directory holding the pickled matrices and vectorizers.
        model_name: Prefix used for every file name written or read here.
        dataset: Suffix appended to ``base_dir``.

    Returns:
        None. The function returns early, without doing any work, when
        ``<classifier_dir>/<model_name>cv_res_sel.json`` already exists.
    """
    # Character-window analysis stays disabled, including for n-gram models.
    char_wb = False
    tf = 'tf' in model_name
    ngram = (2, 15) if 'ngram' in model_name else None

    classifier_dir = base_dir + dataset
    outfile = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
    # NOTE(review): this file is only checked here; presumably it is written
    # by CtuCCAnalyzer.train_and_save -- confirm.
    if os.path.exists(outfile):
        return

    def artifact(suffix):
        # Every pickle of this model lives at <output_dir>/<model_name><suffix>.
        return os.path.join(output_dir, model_name + suffix)

    # Use the cache only when every file we rely on is present. Checking just
    # vec_sel.pkl (as before) crashed on load if X_sel/y_sel were missing.
    cache_files = [artifact(s) for s in ("vec_sel.pkl", "X_sel.pkl", "y_sel.pkl")]
    if all(os.path.exists(path) for path in cache_files):
        X = Learner.obj_from_file(artifact("X_sel.pkl"))
        y = Learner.obj_from_file(artifact("y_sel.pkl"))
    else:
        instances, y = Learner.gen_instances(os.path.join(normal_dir, 'March'), data_path,
                                             char_wb=char_wb, simulate=False)

        # Full feature matrix, persisted before selection.
        X, feature_names, vec = Learner.gen_X_matrix(instances, tf=tf, ngrams_range=ngram)
        Learner.save2file(X, artifact("X.pkl"))
        Learner.save2file(y, artifact("y.pkl"))
        Learner.save2file(vec, artifact("vec.pkl"))
        Learner.save2file(feature_names, artifact("feature_names.pkl"))

        # Reduced matrix (500 is passed as the selection size), persisted
        # under the *_sel names that the cache branch above reads back.
        X, feature_names, vec = Learner.feature_selection(X, y, 500, vec, instances,
                                                          tf=tf, ngram_range=ngram)
        Learner.save2file(X, artifact("X_sel.pkl"))
        Learner.save2file(y, artifact("y_sel.pkl"))
        Learner.save2file(vec, artifact("vec_sel.pkl"))
        Learner.save2file(feature_names, artifact("feature_names_sel.pkl"))

    CtuCCAnalyzer.train_and_save(X, y, model_name, classifier_dir)
def cmp_feature_selection(base_dir, normal_dir, data_path, output_dir, dataset=None):
    """Train a tree on the full feature set and again after feature selection.

    For each of the two runs the vectorizer vocabulary, the trained
    classifier and a JSON summary (``Learner.tree_info`` plus the ``cv``
    value returned by ``Learner.train_tree``) are written to disk.

    Args:
        base_dir: Prefix that is string-concatenated with ``dataset`` to form
            the classifier directory (no separator is inserted).
        normal_dir: Directory whose ``March`` sub-directory is passed to
            ``Learner.gen_instances`` as its first argument.
        data_path: Second argument forwarded to ``Learner.gen_instances``.
        output_dir: Directory receiving the vocabularies and JSON summaries;
            also forwarded to ``Learner.train_tree``.
        dataset: Name appended to ``base_dir`` and to the tree names.
            ``None`` (the default) is treated as an empty string.

    Returns:
        None.
    """
    # The default used to raise TypeError in the string concatenations below.
    if dataset is None:
        dataset = ''
    classifier_dir = base_dir + dataset

    instances, labels = Learner.gen_instances(os.path.join(normal_dir, 'March'), data_path,
                                              simulate=False)

    # --- Run 1: full feature set -------------------------------------------
    data, _feature_names, vec = Learner.gen_X_matrix(instances)
    Learner.save2file(vec.vocabulary_, os.path.join(output_dir, "vocabulary.pkl"))
    CtuCCAnalyzer.logger.info(data.shape)
    clf, cv = Learner.train_tree(data, labels, cross_vali=True,
                                 tree_name='Fig_tree_' + dataset, output_dir=output_dir)
    # os.path.join instead of a literal backslash, so the classifier lands
    # inside classifier_dir on every platform.
    Learner.save2file(clf, os.path.join(classifier_dir, 'classifier.pkl'))
    clf_info = Learner.tree_info(clf)
    clf_info['cv'] = cv
    # Close the handle explicitly so the JSON is flushed to disk.
    with codecs.open(os.path.join(output_dir, 'tree_info.json'), 'w', encoding='utf-8') as fh:
        simplejson.dump(clf_info, fh)

    # --- Run 2: after feature selection (200 is passed as the size) --------
    data, _feature_names, vec = Learner.feature_selection(data, labels, 200, vec, instances)
    # NOTE(review): this reads ``vocabulary`` whereas run 1 reads
    # ``vocabulary_``; which attribute is populated depends on how
    # Learner.feature_selection builds the vectorizer -- confirm.
    Learner.save2file(vec.vocabulary, os.path.join(output_dir, "vocabulary_sel.pkl"))
    CtuCCAnalyzer.logger.info(data.shape)
    clf, cv = Learner.train_tree(data, labels, cross_vali=True,
                                 tree_name='Fig_tree_sel_' + dataset, output_dir=output_dir)
    Learner.save2file(clf, os.path.join(classifier_dir, 'classifier_sel.pkl'))
    clf_info = Learner.tree_info(clf)
    clf_info['cv'] = cv
    # NOTE(review): run 1 serialises with simplejson, this run with json;
    # kept as in the original to avoid changing the output.
    with codecs.open(os.path.join(output_dir, 'tree_info_sel.json'), 'w', encoding='utf-8') as fh:
        json.dump(clf_info, fh)
def zero_day_helper(base_dir, src_name, model_name, algorithm, target_name, normal_dir=None):
    """Apply a classifier stored under ``src_name`` to instances of ``target_name``.

    Args:
        base_dir: Directory containing the per-dataset sub-directories.
        src_name: Sub-directory of ``base_dir`` holding the pickled
            vectorizers and the pickled classifier.
        model_name: Prefix of the pickle file names.
        algorithm: Inserted between ``model_name`` and ``'_sel.pkl'`` to form
            the classifier file name.
        target_name: Dataset to predict on.
        normal_dir: When given, instances are generated from
            ``<normal_dir>/<target_name>`` as the first argument of
            ``Learner.gen_instances``; otherwise from
            ``<base_dir>/<target_name>`` as its second argument.

    Returns:
        Whatever ``Learner.predict`` returns.
    """
    source_dir = os.path.join(base_dir, src_name)
    classifier_file = os.path.join(source_dir, model_name + algorithm + '_sel.pkl')
    full_vec_file = os.path.join(source_dir, model_name + 'vec.pkl')
    selected_vec_file = os.path.join(source_dir, model_name + 'vec_sel.pkl')
    target_path = os.path.join(base_dir, target_name)

    if normal_dir is not None:
        instances, labels = Learner.gen_instances(os.path.join(normal_dir, target_name), '')
    else:
        instances, labels = Learner.gen_instances('', target_path)

    full_vec = Learner.obj_from_file(full_vec_file)
    selected_vec = Learner.obj_from_file(selected_vec_file)

    # Only the matrix is used afterwards; the other two results are discarded.
    matrix, _vocab, _vec = Learner.gen_X_matrix(instances, vec=full_vec)

    classifier = Learner.obj_from_file(classifier_file)
    return Learner.predict(
        classifier,
        selected_vec,
        matrix,
        labels=labels,
        src_name=src_name,
        model_name=model_name,
    )