Esempio n. 1
0
    def cmp_algorithm_cv(base_dir, normal_dir, data_path, output_dir, model_name='', dataset=''):
        """Prepare the selected-feature matrix for one model/dataset and train on it.

        Nothing is done when ``<base_dir + dataset>/<model_name>cv_res_sel.json``
        already exists. Otherwise the selected matrix ``X`` and labels ``y`` are
        either reloaded from ``output_dir`` (when ``<model_name>vec_sel.pkl`` is
        present there) or rebuilt through ``Learner`` and pickled, and finally
        passed to ``CtuCCAnalyzer.train_and_save``.

        :param base_dir: prefix concatenated directly with ``dataset`` (no path
            separator is inserted) to form the classifier directory.
        :param normal_dir: directory whose ``March`` sub-path is handed to
            ``Learner.gen_instances`` as its first argument.
        :param data_path: second argument of ``Learner.gen_instances``.
        :param output_dir: directory where the pickled matrices, labels,
            vectorizers and feature names are read from / written to.
        :param model_name: file-name prefix; a ``'tf'`` substring switches the
            ``tf`` option on and an ``'ngram'`` substring selects the
            ``(2, 15)`` n-gram range.
        :param dataset: suffix appended to ``base_dir``.
        :return: ``None``.
        """
        # The vectorizer options are encoded as substrings of the model name.
        use_tf = 'tf' in model_name
        ngram_span = (2, 15) if 'ngram' in model_name else None
        word_boundary = False  # char_wb is never enabled by this function

        classifier_dir = base_dir + dataset
        result_file = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
        if os.path.exists(result_file):
            # Results for this model/dataset pair are already on disk.
            return

        def artefact(suffix):
            # Path of a pickled artefact belonging to this model.
            return os.path.join(output_dir, model_name + suffix)

        if not os.path.exists(artefact("vec_sel.pkl")):
            instances, y = Learner.gen_instances(
                os.path.join(normal_dir, 'March'),
                data_path,
                char_wb=word_boundary,
                simulate=False,
            )
            X, feature_names, vec = Learner.gen_X_matrix(
                instances, tf=use_tf, ngrams_range=ngram_span)

            # Persist the full (unselected) representation first.
            for payload, suffix in ((X, "X.pkl"),
                                    (y, "y.pkl"),
                                    (vec, "vec.pkl"),
                                    (feature_names, "feature_names.pkl")):
                Learner.save2file(payload, artefact(suffix))

            # Reduce to 500 features, then persist the reduced representation.
            X, feature_names, vec = Learner.feature_selection(
                X, y, 500, vec, instances, tf=use_tf, ngram_range=ngram_span)
            for payload, suffix in ((X, "X_sel.pkl"),
                                    (y, "y_sel.pkl"),
                                    (vec, "vec_sel.pkl"),
                                    (feature_names, "feature_names_sel.pkl")):
                Learner.save2file(payload, artefact(suffix))
        else:
            # The selected vectorizer exists, so reuse the cached matrix/labels.
            X = Learner.obj_from_file(artefact("X_sel.pkl"))
            y = Learner.obj_from_file(artefact("y_sel.pkl"))

        CtuCCAnalyzer.train_and_save(X, y, model_name, classifier_dir)
Esempio n. 2
0
 def feature_tab(base_dir):
     """Print one ``&``-separated table row per (model, dataset) pair.

     For each model prefix and each dataset, the pickled matrix
     ``<model>_X.pkl`` and the list ``<model>_feature_names_sel.pkl`` are
     loaded from ``base_dir + dataset`` via ``Learner.obj_from_file``. A row
     holds the model label (only on the ``Neris`` row, empty otherwise), the
     dataset name, ``X.shape[1]``, the constant ``500`` and four feature
     names, each cut to at most 8 characters.

     :param base_dir: prefix concatenated directly with the dataset name (no
         path separator is inserted).
     :return: ``None``; rows are written to standard output.
     """
     # (file-name key, label shown in the table), in output order.
     labelled_models = (
         ('bag', 'Bag-of-word'),
         ('bag-ngram', 'Bag-of-word-NGram'),
         ('tf', 'Tf-idf'),
         ('tf-ngram', 'Tf-idf-NGram'),
     )
     for model_key, label in labelled_models:
         prefix = model_key + '_'
         for dataset in ['Neris', 'Murlo', 'Virut', 'Sogou']:
             # The label is shown once per model, on its first (Neris) row.
             row_label = label if dataset == 'Neris' else ''
             line = row_label + ' & ' + dataset + '& '
             output_dir = base_dir + dataset
             X = Learner.obj_from_file(os.path.join(output_dir, prefix + "X.pkl"))
             line += str(X.shape[1]) + ' & 500 & '
             feature_names = Learner.obj_from_file(
                 os.path.join(output_dir, prefix + "feature_names_sel.pkl"))
             # NOTE(review): entries 1..4 are listed and entry 0 is skipped;
             # confirm this is intended.
             for i in range(1, 5):
                 feature_name = feature_names[i]
                 if len(feature_name) > 8:
                     feature_name = str(feature_name)[0:8]
                 line += feature_name + ', '
             line += ' ...\\\\ '
             # Parenthesised form prints the same text under Python 2 and is
             # valid syntax under Python 3, unlike the bare print statement.
             print(line)
Esempio n. 3
0
 def zero_day_helper(base_dir, src_name, model_name, algorithm, target_name, normal_dir=None):
     """Run a classifier stored for ``src_name`` against ``target_name`` data.

     The classifier (``<model_name><algorithm>_sel.pkl``) and both vectorizers
     (``<model_name>vec.pkl`` and ``<model_name>vec_sel.pkl``) are loaded from
     ``base_dir/src_name``. Instances are generated by
     ``Learner.gen_instances``, converted with ``Learner.gen_X_matrix`` using
     the first vectorizer, and passed to ``Learner.predict`` together with the
     selected vectorizer.

     :param base_dir: root directory holding one sub-directory per dataset.
     :param src_name: sub-directory the classifier and vectorizers come from.
     :param model_name: file-name prefix of the pickled artefacts.
     :param algorithm: file-name part placed between ``model_name`` and
         ``_sel.pkl`` in the classifier file name.
     :param target_name: name of the data to predict on.
     :param normal_dir: when ``None``, ``base_dir/target_name`` is passed as
         the second argument of ``Learner.gen_instances`` (first is ``''``);
         otherwise ``normal_dir/target_name`` is passed as the first argument
         (second is ``''``).
     :return: whatever ``Learner.predict`` returns.
     """
     src_dir = os.path.join(base_dir, src_name)
     classifier_file = os.path.join(src_dir, model_name + algorithm + '_sel.pkl')
     target_dir = os.path.join(base_dir, target_name)

     if normal_dir is not None:
         samples, labels = Learner.gen_instances(os.path.join(normal_dir, target_name), '')
     else:
         samples, labels = Learner.gen_instances('', target_dir)

     base_vec = Learner.obj_from_file(os.path.join(src_dir, model_name + 'vec.pkl'))
     selected_vec = Learner.obj_from_file(os.path.join(src_dir, model_name + 'vec_sel.pkl'))

     # Only the matrix is needed; the other two return values are discarded.
     matrix, _vocab, _vec = Learner.gen_X_matrix(samples, vec=base_vec)

     classifier = Learner.obj_from_file(classifier_file)
     return Learner.predict(classifier, selected_vec, matrix,
                            labels=labels, src_name=src_name, model_name=model_name)