Esempio n. 1
0
    def cmp_algorithm_cv(base_dir, normal_dir, data_path, output_dir, model_name='', dataset=''):
        """Prepare the feature-selected matrix for ``model_name`` and train on it.

        The work is skipped entirely when ``<base_dir + dataset>/<model_name>cv_res_sel.json``
        already exists. Otherwise the selected matrix and labels are either
        reloaded from ``output_dir`` (when ``<model_name>vec_sel.pkl`` is there)
        or rebuilt from scratch, with every intermediate object pickled into
        ``output_dir`` under the ``model_name`` prefix. Finally the matrix and
        labels are handed to ``CtuCCAnalyzer.train_and_save``.

        :param base_dir: prefix that is concatenated (plain ``+``) with ``dataset``
            to form the classifier directory.
        :param normal_dir: directory whose ``March`` sub-directory is passed as
            the first argument of ``Learner.gen_instances``.
        :param data_path: second argument of ``Learner.gen_instances``.
        :param output_dir: directory holding the cached ``.pkl`` artifacts.
        :param model_name: file-name prefix; a ``'tf'`` substring switches the
            ``tf`` flag on, an ``'ngram'`` substring selects the range ``(2, 15)``.
        :param dataset: suffix appended to ``base_dir``.
        :return: always ``None``.
        """
        # The model name doubles as the configuration switchboard.
        use_tf = 'tf' in model_name
        ngram_range = (2, 15) if 'ngram' in model_name else None

        classifier_dir = base_dir + dataset
        result_file = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
        if os.path.exists(result_file):
            # Results for this model are already on disk; nothing to do.
            return

        def artifact(suffix):
            # Every cached object lives in output_dir under the model-name prefix.
            return os.path.join(output_dir, model_name + suffix)

        if not os.path.exists(artifact("vec_sel.pkl")):
            # No cached selection: build the instances (char_wb stays disabled,
            # even for n-gram models) and vectorise them.
            instances, labels = Learner.gen_instances(
                os.path.join(normal_dir, 'March'), data_path,
                char_wb=False, simulate=False)
            matrix, names, vectorizer = Learner.gen_X_matrix(
                instances, tf=use_tf, ngrams_range=ngram_range)

            # Persist the full (pre-selection) artifacts first.
            for obj, suffix in ((matrix, "X.pkl"),
                                (labels, "y.pkl"),
                                (vectorizer, "vec.pkl"),
                                (names, "feature_names.pkl")):
                Learner.save2file(obj, artifact(suffix))

            matrix, names, vectorizer = Learner.feature_selection(
                matrix, labels, 500, vectorizer, instances,
                tf=use_tf, ngram_range=ngram_range)

            # Then the artifacts produced by feature selection.
            for obj, suffix in ((matrix, "X_sel.pkl"),
                                (labels, "y_sel.pkl"),
                                (vectorizer, "vec_sel.pkl"),
                                (names, "feature_names_sel.pkl")):
                Learner.save2file(obj, artifact(suffix))
        else:
            # A previous run already produced the selected matrix and labels.
            matrix = Learner.obj_from_file(artifact("X_sel.pkl"))
            labels = Learner.obj_from_file(artifact("y_sel.pkl"))

        CtuCCAnalyzer.train_and_save(matrix, labels, model_name, classifier_dir)
Esempio n. 2
0
    def cmp_feature_selection(base_dir, normal_dir, data_path, output_dir, dataset=None):
        """Train a tree on the full feature matrix and again after feature selection.

        For each of the two runs the vocabulary is pickled into ``output_dir``,
        the matrix shape is logged, a tree is trained with cross-validation via
        ``Learner.train_tree``, the classifier is pickled into the classifier
        directory (``base_dir + dataset``), and the tree description (with the
        cross-validation result stored under ``'cv'``) is written as JSON into
        ``output_dir``. The second run uses the output of
        ``Learner.feature_selection`` called with ``200`` and writes files
        carrying a ``_sel`` suffix.

        :param base_dir: prefix concatenated (plain ``+``) with ``dataset`` to
            form the classifier directory.
        :param normal_dir: directory whose ``March`` sub-directory is passed as
            the first argument of ``Learner.gen_instances``.
        :param data_path: second argument of ``Learner.gen_instances``.
        :param output_dir: directory receiving vocabularies and tree-info JSON;
            also forwarded to ``Learner.train_tree``.
        :param dataset: name appended to ``base_dir`` and to the tree names;
            ``None`` is treated as the empty string.
        :return: always ``None``.
        """
        # The default of None used to blow up on the string concatenations
        # below; fall back to an empty name so the default is usable.
        if dataset is None:
            dataset = ''
        classifier_dir = base_dir + dataset

        instances, labels = Learner.gen_instances(os.path.join(normal_dir, 'March'),
                                                  data_path, simulate=False)

        def train_and_record(matrix, vocabulary, tag, dump):
            # One full run: save vocabulary, train, save classifier, dump tree info.
            # `tag` is '' for the full matrix and '_sel' after feature selection.
            Learner.save2file(vocabulary, os.path.join(output_dir, 'vocabulary' + tag + '.pkl'))
            CtuCCAnalyzer.logger.info(matrix.shape)
            clf, cv = Learner.train_tree(matrix, labels, cross_vali=True,
                                         tree_name='Fig_tree' + tag + '_' + dataset,
                                         output_dir=output_dir)
            # os.path.join instead of a hard-coded '\\' so the classifier lands
            # inside classifier_dir on every platform.
            Learner.save2file(clf, os.path.join(classifier_dir, 'classifier' + tag + '.pkl'))

            clf_info = Learner.tree_info(clf)
            clf_info['cv'] = cv

            # Context manager guarantees the JSON is flushed and the handle closed.
            info_path = os.path.join(output_dir, 'tree_info' + tag + '.json')
            with codecs.open(info_path, 'w', encoding='utf-8') as handle:
                dump(clf_info, handle)

        data, _, vec = Learner.gen_X_matrix(instances)
        train_and_record(data, vec.vocabulary_, '', simplejson.dump)

        data, _, vec = Learner.feature_selection(data, labels, 200, vec, instances)
        # NOTE(review): this run saves `vec.vocabulary` while the first run saves
        # `vec.vocabulary_`; kept as in the original because the vectorizer
        # returned by Learner.feature_selection is not visible here - confirm
        # which attribute is intended. The json/simplejson split is likewise
        # preserved from the original.
        train_and_record(data, vec.vocabulary, '_sel', json.dump)
Esempio n. 3
0
 def zero_day_helper(base_dir, src_name, model_name, algorithm, target_name, normal_dir=None):
     """Apply a model stored under ``src_name`` to instances of ``target_name``.

     The pickled model ``<model_name><algorithm>_sel.pkl`` and the two pickled
     vectorizers ``<model_name>vec.pkl`` / ``<model_name>vec_sel.pkl`` are read
     from ``base_dir/src_name``. Instances come from ``base_dir/target_name``
     (passed as the second argument of ``Learner.gen_instances``) when
     ``normal_dir`` is ``None``; otherwise from ``normal_dir/target_name``
     (passed as the first argument).

     :param base_dir: root directory containing the source and target folders.
     :param src_name: sub-directory holding the model and vectorizers; also
         forwarded to ``Learner.predict``.
     :param model_name: file-name prefix of the pickles; also forwarded to
         ``Learner.predict``.
     :param algorithm: middle part of the model file name.
     :param target_name: sub-directory with the data to classify.
     :param normal_dir: optional alternative root for the target data.
     :return: whatever ``Learner.predict`` returns.
     """
     source_dir = os.path.join(base_dir, src_name)
     target_path = os.path.join(base_dir, target_name)
     classifier_file = os.path.join(source_dir, model_name + algorithm + '_sel.pkl')

     # Exactly one of the two gen_instances arguments carries a real path.
     if normal_dir is not None:
         instance_args = (os.path.join(normal_dir, target_name), '')
     else:
         instance_args = ('', target_path)
     instances, labels = Learner.gen_instances(*instance_args)

     full_vectorizer = Learner.obj_from_file(os.path.join(source_dir, model_name + 'vec.pkl'))
     selected_vectorizer = Learner.obj_from_file(os.path.join(source_dir, model_name + 'vec_sel.pkl'))

     # Only the matrix is used further; the other two results are discarded.
     matrix, _vocab, _vec = Learner.gen_X_matrix(instances, vec=full_vectorizer)

     classifier = Learner.obj_from_file(classifier_file)
     return Learner.predict(classifier, selected_vectorizer, matrix,
                            labels=labels, src_name=src_name, model_name=model_name)