from ui_exerciser import UIExerciser
from utils import Utilities
import re
import os
import time

if __name__ == '__main__':
    ISOTIMEFORMAT = '%m%d-%H-%M-%S'
    logger = Utilities.set_logger('COSMOS_TRIGGER_PY-Console')

    device = 'nexus4'
    pc = 'iai'

    if device == 'nexus4':
        series = '01b7006e13dd12a1'
    elif device == 'galaxy':
        series = '014E233C1300800B'
    elif device == 'nexuss':
        series = '39302E8CEA9B00EC'
    else:
        series = 'emulator-5554'

    user = '******'
    aapt_loc = 'C:\\Users\\' + user + '\\AppData\\Local\\Android\\sdk/build-tools/19.1.0/aapt.exe'
    apk_dir = 'C:\\Users\\' + user + '\\Documents\\FlowIntent\\apks\\VirusShare_Android_20130506_3\\'
    UIExerciser.emu_loc = 'C:\\Users\\hfu\\AppData\\Local\\Android\\sdk/tools/emulator.exe'
    UIExerciser.emu_name = 'Qvga'

    out_base_dir = os.path.abspath(os.pardir + '/output/') + '/'

    #UIExerciser.emu_proc = UIExerciser.open_emu(UIExerciser.emu_loc, UIExerciser.emu_name)


# Imports assumed by the Learner example below; StemmedCountVectorizer and
# StemmedTfidfVectorizer are project-local stemming subclasses of the
# scikit-learn vectorizers and are not defined in this snippet.
import random
import string
import cPickle
import simplejson
import numpy as np
from time import time
from sklearn import svm, metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


class Learner:
    logger = Utilities.set_logger('Learner')

    class LabelledDocs:
        @staticmethod
        def stem_tokens(tokens, stemmer):
            stemmed = []
            for item in tokens:
                stemmed.append(stemmer.stem(item))
            return stemmed

        def tokenize(self, text):
            vectorizer = CountVectorizer(analyzer='word')
            vectorizer.fit_transform([text])
            tokens = vectorizer.get_feature_names()
            # stems = self.stem_tokens(tokens, stemmer)
            return tokens

        def __init__(self, doc, label, char_wb=False):
            self.doc = doc
            self.label = label
            tokens = self.tokenize(doc)
            if char_wb:
                self.doc = ''.join(tokens)
            else:
                self.doc = ' '.join(tokens)

    @staticmethod
    def dir2jsons(json_dir):
        jsons = []
        if json_dir is None:
            return jsons
        for root, dirs, files in os.walk(json_dir, topdown=False):
            for filename in files:
                if '201' in filename and re.search('json$', filename):
                    with open(os.path.join(root, filename), "rb") as fin:
                        try:
                            jsons.append(simplejson.load(fin))
                        except Exception as e:
                            pass
                            # Utilities.logger.error(e)
        return jsons

    @staticmethod
    def same_prefix(str_a, str_b):
        # True if the first seven characters of str_a and str_b match
        for i, c in enumerate(str_a):
            if i > 6:
                return True
            if i >= len(str_b) or c != str_b[i]:
                return False
        return False

    @staticmethod
    def feature_filter_by_prefix(vocab, docs):
        examined = []
        for i in range(len(vocab)):
            Learner.logger.info('i: ' + vocab[i] + ' ' + str(i))
            if len(vocab[i]) < 6 or vocab[i] in examined:
                continue
            for j in range(i + 1, len(vocab)):
                # Learner.logger.info('j: ' + vocab[j] + ' ' + str(j))
                if len(vocab[j]) < 6:
                    examined.append(vocab[j])
                    continue
                if vocab[i] in vocab[j] or vocab[j] in vocab[i]:
                    # Learner.same_prefix(vocab[i], vocab[j])
                    # Learner.logger.info('Found ' + vocab[i] + ' ' + vocab[j] + ' ' + str(i))
                    examined.append(vocab[j])
                    for doc in docs:
                        if vocab[j] in doc.doc:
                            doc.doc = str(doc.doc).replace(vocab[j], vocab[i])
        instances = []
        labels = []
        for doc in docs:
            instances.append(doc.doc)
            labels.append(doc.label)
        vectorizer = StemmedCountVectorizer(analyzer="word",
                                            tokenizer=None,
                                            preprocessor=None,
                                            stop_words=None)
        train_data = vectorizer.fit_transform(instances)

        # train_data is a sparse matrix; call .toarray() only if a dense array is needed
        # train_data = train_data.toarray()
        Learner.logger.info(train_data.shape)
        return train_data, labels

    @staticmethod
    def gen_instances(pos_json_dir,
                      neg_json_dir,
                      simulate=False,
                      char_wb=False):
        pos_jsons = Learner.dir2jsons(pos_json_dir)
        neg_jsons = Learner.dir2jsons(neg_json_dir)
        Learner.logger.info('lenPos: ' + str(len(pos_jsons)))
        Learner.logger.info('lenNeg: ' + str(len(neg_jsons)))
        docs = Learner.gen_docs(pos_jsons, 1, char_wb)
        docs = docs + (Learner.gen_docs(neg_jsons, -1, char_wb))
        if simulate:
            if len(neg_jsons) == 0:
                docs = docs + Learner.simulate_flows(len(pos_jsons), 0)
        instances = []
        labels = []
        for doc in docs:
            instances.append(doc.doc)
            labels.append(doc.label)

        return instances, np.array(labels)

    @staticmethod
    def gen_X_matrix(instances, vec=None, tf=False, ngrams_range=None):
        # Initialize the vectorizer: bag-of-words or TF-IDF, optionally over char n-grams
        if vec is not None:
            train_data = vec.transform(instances)
            vocab = vec.get_feature_names()
            return train_data, vocab, vec
        if not tf:
            if ngrams_range is None:
                vectorizer = StemmedCountVectorizer(analyzer="word",
                                                    tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=['http'])
            else:
                vectorizer = StemmedCountVectorizer(analyzer='char_wb',
                                                    tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=['http'],
                                                    ngram_range=ngrams_range)
        else:
            if ngrams_range is None:
                vectorizer = StemmedTfidfVectorizer(analyzer="word",
                                                    tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=['http'])
            else:
                vectorizer = StemmedTfidfVectorizer(analyzer='char_wb',
                                                    tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=None,
                                                    ngram_range=ngrams_range)
        # fit_transform() does two things: it fits the model and learns the
        # vocabulary, then transforms the training data into feature vectors.
        # Its input should be a list of strings.
        train_data = vectorizer.fit_transform(instances)

        # train_data is a sparse matrix; call .toarray() only if a dense array is needed
        # train_data = train_data.toarray()
        Learner.logger.info(train_data.shape)
        # Take a look at the words in the vocabulary
        vocab = vectorizer.get_feature_names()
        # Learner.logger.info(vocab)
        # train_data, labels = Learner.feature_filter_by_prefix(vocab, docs)

        return train_data, vocab, vectorizer

    @staticmethod
    def ocsvm(train_data, labels, cross_vali=True):
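        # nu is set to the observed fraction of -1 (negative) samples; note that
        # sklearn's OneClassSVM requires 0 < nu <= 1, so at least one -1 label is assumed.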
        nu = float(np.count_nonzero(labels == -1)) / len(labels)
        clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=0.1)
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
            # separators=(',', ':'), sort_keys=True, indent=4)
            Learner.logger.info('OCSVM: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))

        clf.fit(train_data)

        return clf, results

    @staticmethod
    def train_bayes(train_data, labels, cross_vali=True):
        clf = BernoulliNB()
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
            # separators=(',', ':'), sort_keys=True, indent=4)
            Learner.logger.info('Bayes: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))

        # Fit the Naive Bayes classifier on the full training set
        clf = clf.fit(train_data, labels)

        return clf, results

    @staticmethod
    def class_report(conf_mat):
        # sklearn's confusion_matrix with labels (-1, 1) flattens to [tn, fp, fn, tp]
        # (1 is the positive class), so unpack in that order
        tn, fp, fn, tp = conf_mat.flatten()
        measures = {
            'accuracy': (tp + tn) / (tp + fp + fn + tn),
            'fp_rate': fp / (tn + fp),
            'recall': tp / (tp + fn),
            'precision': tp / (tp + fp),
            'f1score': 2 * tp / (2 * tp + fp + fn)
        }
        # measures['tn_rate'] = tn / (tn + fp)  # (true negative rate)
        return measures

    @staticmethod
    def cross_validation(clf, data, labels, scoring='f1', n_fold=5):
        """Run cross-validation and return the per-fold ROC AUC scores, the averaged
        confusion matrix, and (when shuffle is disabled) the indices of false
        positives and false negatives."""
        X = data
        y = np.array(labels)
        t0 = time()
        results = dict()
        # cv = KFold(n_splits=5, shuffle=True)

        # Stratified K-fold split for cross-validation
        shuffle = True
        kf = StratifiedKFold(n_splits=n_fold, shuffle=shuffle, random_state=42)
        scores = []
        conf_mat = np.zeros((2, 2))  # Binary classification

        # Run the cross-validation folds
        for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
            result = dict()
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Train the classifier on this fold
            clf.fit(X_train, y_train)

            # Predict on the held-out fold
            predicted = clf.predict(X_test)
            y_plabs = np.squeeze(predicted)
            if hasattr(clf, 'predict_proba'):
                y_pprobs = clf.predict_proba(X_test)  # predicted probabilities
                result['roc'] = metrics.roc_auc_score(y_test, y_pprobs[:, 1])
            else:  # for SVM
                y_decision = clf.decision_function(X_test)
                try:
                    result['roc'] = metrics.roc_auc_score(
                        y_test, y_decision[:, 1])
                except:  # OCSVM
                    result['roc'] = metrics.roc_auc_score(y_test, y_decision)
            # metrics.roc_curve(y_test, y_pprobs[:, 1])
            scores.append(result['roc'])

            # Learner.perf_measure(predicted, y_test)

            # Per-fold accuracy (not currently used)
            # ac = accuracy_score(predicted, y_test)

            # Accumulate the confusion matrix for this fold
            confusion = metrics.confusion_matrix(y_test, predicted)
            conf_mat += confusion
            result['conf_mat'] = confusion.tolist()

            # Collect the indices of false positives/negatives; meaningful only when
            # shuffle=False (or if the original ordering is backed up), since test_index
            # then maps back to the original data order
            if not shuffle:
                fp_i = np.where((y_plabs == 1) & (y_test == -1))[0]
                fn_i = np.where((y_plabs == -1) & (y_test == 1))[0]
                result['fp_item'] = test_index[fp_i]
                result['fn_item'] = test_index[fn_i]
            results['fold_' + str(fold)] = result

        # cv_res = cross_val_score(clf, data, labels, cv=cv, scoring='f1').tolist()
        # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
        # separators=(',', ':'), sort_keys=True, indent=4)
        duration = time() - t0
        results['duration'] = duration
        # results['cv_res'] = cv_res
        # results['cv_res_mean'] = sum(cv_res) / n_splits

        # print "\nMean score: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2)
        results['mean_scores'] = np.mean(scores)
        results['std_scores'] = np.std(scores)
        conf_mat /= n_fold
        # print "Mean CM: \n", conf_mat

        # print "\nMean classification measures: \n"
        results['mean_conf_mat'] = Learner.class_report(conf_mat)
        # return scores, conf_mat, {'fp': sorted(false_pos), 'fn': sorted(false_neg)}
        return results

    @staticmethod
    def train_SVM(train_data, labels, cross_vali=True):
        clf = svm.SVC(class_weight='balanced', probability=True)
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
            # separators=(',', ':'), sort_keys=True, indent=4)
            Learner.logger.info('SVM: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))

        # Fit the SVM on the full training set (this may take a while)
        clf = clf.fit(train_data, labels)

        return clf, results

    @staticmethod
    def train_logistic(train_data, labels, cross_vali=True):
        clf = LogisticRegression(class_weight='balanced')
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
            # separators=(',', ':'), sort_keys=True, indent=4)
            Learner.logger.info('Logistic: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))

        # Fit the logistic regression model on the full training set
        clf = clf.fit(train_data, labels)

        return clf, results

    @staticmethod
    def train_tree(train_data,
                   labels,
                   cross_vali=True,
                   res=None,
                   output_dir=os.curdir,
                   tree_name='tree'):
        clf = DecisionTreeClassifier(class_weight='balanced')
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
            # separators=(',', ':'), sort_keys=True, indent=4)
            Learner.logger.info('Tree: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))

        # Fit the decision tree on the full training set
        clf = clf.fit(train_data, labels)
        """
        tree.export_graphviz(clf, out_file=output_dir + '/' + tree_name + '.dot',
                             feature_names=feature_names,
                             label='root', impurity=False, special_characters=True)  # , max_depth=5)
        dotfile = open(output_dir + '/' + tree_name + '.dot', 'r')
        graph = pydotplus.graph_from_dot_data(dotfile.read())
        graph.write_pdf(output_dir + '/' + tree_name + '.pdf')
        dotfile.close()
        """
        if res is not None:
            res['tree'] = results
        return clf, results

    @staticmethod
    def train_classifier(func, X, y, cv, result_dict, tag):
        result_dict[tag] = func(X, y, cv)

    @staticmethod
    def rand_str(size=6, chars=string.ascii_uppercase + string.digits):
        url = ''.join(random.choice(chars) for _ in range(size))
        if url[0] < 'k':
            url = url + 'net'
        else:
            url = url + 'com'
        url = 'www.' + url
        return url

    @staticmethod
    def simulate_flows(size, label):
        docs = []
        for _ in range(size):
            # rand_str() already prepends 'www.', so use its result directly
            docs.append(Learner.LabelledDocs(Learner.rand_str(), label))
        return docs

    @staticmethod
    def tree_info(clf):
        info = dict()
        n_nodes = clf.tree_.node_count
        # children_left = clf.tree_.children_left
        # children_right = clf.tree_.children_right
        # feature = clf.tree_.max_features
        # n_feature = clf.tree_.n_features_
        # The tree structure can be traversed to compute various properties such
        # as the depth of each node and whether or not it is a leaf.
        depth = clf.tree_.max_depth
        info['n_nodes'] = n_nodes
        info['depth'] = depth
        Learner.logger.info(info)
        return info

    @staticmethod
    def gen_docs(jsons, label, char_wb=False):
        docs = []
        for flow in jsons:
            # the label could alternatively be read from flow['label']
            line = flow['domain'] + flow['uri']
            try:
                docs.append(Learner.LabelledDocs(line, label, char_wb=char_wb))
            except Exception:
                print line
        return docs

    @staticmethod
    def predict(model,
                vec,
                instances,
                labels=None,
                src_name='',
                model_name=''):
        # loaded_vec = CountVectorizer(decode_error="replace", vocabulary=voc)
        data = vec.transform(instances)
        y_1 = model.predict(data)

        # Learner.logger.info(y_1)
        if labels is not None:
            return accuracy_score(labels, y_1)

    @staticmethod
    def feature_selection(X,
                          y,
                          k,
                          count_vectorizer,
                          instances,
                          tf=False,
                          ngram_range=None):
        ch2 = SelectKBest(chi2, k=k)
        X_new = ch2.fit_transform(X, y)
        feature_names = count_vectorizer.get_feature_names()
        if feature_names is not None:
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        '''
        dict = np.asarray(count_vectorizer.get_feature_names())[ch2.get_support()]
        if tf:
            if ngram_range is not None:
                count_vectorizer = StemmedTfidfVectorizer(analyzer='char_wb', ngram_range=ngram_range, vocabulary=dict)
            else:
                count_vectorizer = StemmedTfidfVectorizer(analyzer='char_wb', vocabulary=dict)
        else:
            if ngram_range is not None:
                count_vectorizer = StemmedCountVectorizer(analyzer='word', vocabulary=dict, ngram_range=ngram_range)
            else:
                count_vectorizer = StemmedCountVectorizer(analyzer="word", vocabulary=dict)
        X_new = count_vectorizer.fit_transform(instances)
        # cPickle.dump(count_vectorizer.vocabulary, open(output_dir + '/' + "vocabulary.pkl", "wb"))
        '''
        return X_new, feature_names, ch2

    @staticmethod
    def pipe_feature_selection(X, y):
        # Pipeline steps take estimator objects, not already-transformed data
        clf = Pipeline([('feature_selection', SelectKBest(chi2, k=2)),
                        ('classification', RandomForestClassifier())])
        clf.fit(X, y)
        return clf

    @staticmethod
    def save2file(obj, path):
        # save the obj
        with open(path, 'wb') as fid:
            cPickle.dump(obj, fid)

    @staticmethod
    def obj_from_file(path):
        return cPickle.load(open(path, 'rb'))


if __name__ == '__main__':
    logger = Utilities.set_logger('Learner')
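    # Minimal end-to-end sketch (illustrative only; the directory arguments below are
    # placeholders, not paths from the original project):
    # instances, labels = Learner.gen_instances('path/to/pos_jsons', 'path/to/neg_jsons')
    # X, vocab, vec = Learner.gen_X_matrix(instances)
    # X_sel, sel_names, ch2 = Learner.feature_selection(X, labels, 200, vec, instances)
    # clf, cv_res = Learner.train_tree(X_sel, labels, cross_vali=True)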


import sys
import json
import codecs
import types
from threading import Thread
from xml.dom.minidom import parseString
from xml.etree import ElementTree as ET
# The ViewClient-related names used further below (ViewClient, View, ViewClientOptions,
# VERBOSE, WINDOW, MAP, ...) come from the AndroidViewClient package and its dump
# tooling; they are assumed to be imported in the original module.


class CtuAdAnalyzer:
    logger = Utilities.set_logger('CTU-Ad')

    @staticmethod
    def cv_result_table(base_dir):
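        # Print LaTeX table rows from the saved cross-validation results: one row per
        # algorithm with duration, recall, FP rate, precision, F1 and mean ROC AUC.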
        for model_name in ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
            print '\\\\'
            model = dict()

            model_name = model_name + '_'

            for dataset in ['']:
                output_dir = base_dir + dataset

                with open(
                        os.path.join(output_dir,
                                     model_name + 'cv_res_sel.json'),
                        "rb") as fin:
                    cv_res = simplejson.load(fin)
                    # print cv_res
                    for algorithm in cv_res:
                        if algorithm not in model:
                            model[algorithm] = dict()
                        results = cv_res[algorithm]
                        model[algorithm][dataset] = results
                        # print(algorithm + ': ' + str(results['duration']))
                        # print('mean scores:' + str(results['mean_scores']))
                        # print('mean_conf:' + str(results['mean_conf_mat']))

            for algorithm in ['tree', 'bayes', 'logistic', 'svm', 'ocsvm']:
                if algorithm == 'tree':
                    algorithm_name = 'Decision Tree'
                elif algorithm == 'bayes':
                    algorithm_name = 'Naive Bayes'
                elif algorithm == 'logistic':
                    algorithm_name = 'Logistic Regression'
                elif algorithm == 'svm':
                    algorithm_name = 'SVM'
                else:
                    algorithm_name = 'OCSVM'

                for dataset in ['']:
                    results = model[algorithm][dataset]
                    mean_conf = results['mean_conf_mat']
                    recall = str('{:.3%}'.format(mean_conf['recall'])).replace(
                        '%', '\%')
                    fp = str('{:.3%}'.format(mean_conf['fp_rate'])).replace(
                        '%', '\%')
                    precision = str('{:.3%}'.format(
                        mean_conf['precision'])).replace('%', '\%')
                    f1 = str('{:.3%}'.format(mean_conf['f1score'])).replace(
                        '%', '\%')
                    mean_score = str('{:.3%}'.format(
                        results['mean_scores'])).replace('%', '\%')
                    duration = str('{:.3}'.format(results['duration']))
                    print ' & ' + algorithm_name + ' & ' + duration + ' & ' + recall \
                          + ' & ' + fp + ' & ' + precision \
                          + ' & ' + f1 + ' & ' + mean_score + ' \\\\ '

    @staticmethod
    def cmp_model_cv(base_dir, normal_dir):
        """
        Cmp between bag-of-words, Tf-idf, bag-ngrams, Tf-ngrams
        :return:
        """
        classifier_dir = base_dir
        """
        Pool().map(CtuCCAnalyzer.cmp_algorithm_cv, [base_dir, base_dir, base_dir, base_dir],
                   [normal_dir, normal_dir, normal_dir, normal_dir],
                   [classifier_dir, classifier_dir, classifier_dir, classifier_dir],
                   ['bag_', 'bag-ngram_', 'tf_', 'tf-ngram_'])
        """
        threads = dict()
        for model_name in ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
            threads[model_name] = Thread(target=CtuCCAnalyzer.cmp_algorithm_cv,
                                         args=(base_dir, normal_dir,
                                               classifier_dir, classifier_dir,
                                               model_name + '_'))
            threads[model_name].start()

        for model_name in threads:
            threads[model_name].join()


class CtuCCAnalyzer:
    logger = Utilities.set_logger('CTU-13-CC')

    @staticmethod
    def cmp_feature_selection(base_dir, normal_dir, data_path, output_dir, dataset=None):
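        # Train a decision tree on the full feature set, then again after chi2 selection
        # of the top 200 features; vocabularies, classifiers and tree statistics are saved.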
        classifier_dir = base_dir + dataset
        instances, labels = Learner.gen_instances(os.path.join(normal_dir, 'March'),
                                                  data_path, simulate=False)
        data, feature_names, vec = Learner.gen_X_matrix(instances)
        back = [data, labels, feature_names, vec]

        Learner.save2file(vec.vocabulary_, output_dir + '/' + "vocabulary.pkl")
        CtuCCAnalyzer.logger.info(data.shape)
        clf, cv = Learner.train_tree(data, labels, cross_vali=True,
                                     tree_name='Fig_tree_' + dataset, output_dir=output_dir)
        Learner.save2file(clf, classifier_dir + '\\' + 'classifier.pkl')

        clf_info = Learner.tree_info(clf)
        clf_info['cv'] = cv

        simplejson.dump(clf_info, codecs.open(output_dir + '/tree_info.json', 'w', encoding='utf-8'))

        data, labels, feature_names, vec = back
        data, feature_names, vec = Learner.feature_selection(data, labels, 200, vec, instances)

        Learner.save2file(vec.vocabulary, output_dir + '/' + "vocabulary_sel.pkl")
        CtuCCAnalyzer.logger.info(data.shape)
        clf, cv = Learner.train_tree(data, labels, cross_vali=True,
                                     tree_name='Fig_tree_sel_' + dataset, output_dir=output_dir)
        Learner.save2file(clf, classifier_dir + '\\' + 'classifier_sel.pkl')

        clf_info = Learner.tree_info(clf)
        clf_info['cv'] = cv

        json.dump(clf_info, codecs.open(output_dir + '/tree_info_sel.json', 'w', encoding='utf-8'))
        # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'))
        # separators=(',', ':'), sort_keys=True, indent=4)

    @staticmethod
    def cmp_model_cv(base_dir, normal_dir):
        """
        Cmp between bag-of-words, Tf-idf, bag-ngrams, Tf-ngrams
        :return:
        """
        for model_name in ['bag']:  # 'bag-ngram', 'tf', 'tf-ngram']:
            CtuCCAnalyzer.logger.info(model_name + "----------------------------------")
            for dataset in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                classifier_dir = base_dir + dataset
                CtuCCAnalyzer.cmp_algorithm_cv(base_dir, normal_dir, classifier_dir, classifier_dir,
                                               dataset=dataset, model_name=model_name + '_')

    @staticmethod
    def train_and_save(X, y, model_name, classifier_dir):
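        # Train the five classifiers in parallel threads; Learner.train_classifier stores
        # each (clf, cv_results) pair in the shared `results` dict under its tag.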
        outfile = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
        cv_res = dict()
        results = dict()
        thread1 = Thread(target=Learner.train_classifier, args=(Learner.train_tree, X, y, True, results, 'tree'))
        thread2 = Thread(target=Learner.train_classifier, args=(Learner.train_bayes, X, y, True, results, 'bayes'))
        thread3 = Thread(target=Learner.train_classifier,
                         args=(Learner.train_logistic, X, y, True, results, 'logistic'))
        thread4 = Thread(target=Learner.train_classifier, args=(Learner.train_SVM, X, y, True, results, 'svm'))
        thread5 = Thread(target=Learner.train_classifier, args=(Learner.ocsvm, X, y, True, results, 'ocsvm'))

        thread1.start()
        thread2.start()
        thread3.start()
        thread4.start()
        thread5.start()

        thread1.join()
        thread2.join()
        thread3.join()
        thread4.join()
        thread5.join()

        clf_tree, cv_res['tree'] = results['tree']
        clf_bayes, cv_res['bayes'] = results['bayes']
        clf_logistic, cv_res['logistic'] = results['logistic']
        clf_svm, cv_res['svm'] = results['svm']
        clf_ocsvm, cv_res['ocsvm'] = results['ocsvm']
        Learner.save2file(clf_tree, os.path.join(classifier_dir, model_name + 'tree_sel.pkl'))
        Learner.save2file(clf_bayes, os.path.join(classifier_dir, model_name + 'bayes_sel.pkl'))
        Learner.save2file(clf_logistic, os.path.join(classifier_dir, model_name + 'logistic_sel.pkl'))
        Learner.save2file(clf_svm, os.path.join(classifier_dir, model_name + 'svm_sel.pkl'))
        Learner.save2file(clf_ocsvm, os.path.join(classifier_dir, model_name + 'ocsvm_sel.pkl'))
        CtuCCAnalyzer.logger.info('Threads Done! Saving cv_res...')
        json.dump(cv_res, codecs.open(outfile, 'w', encoding='utf-8'))
        """

        result1, result2, result3, result4, result5 = Pool().map(Learner.train_classifier,
                            [Learner.train_tree, Learner.train_bayes, Learner.train_logistic, Learner.train_SVM, Learner.ocsvm],
                            [X, X, X, X, X], [y, y, y, y, y], [True, True, True, True, True])

        clf_tree, cv_res['tree'] = result1
        clf_bayes, cv_res['bayes'] = result2
        clf_logistic, cv_res['logistic'] = result3
        clf_svm, cv_res['svm'] = result4
        clf_ocsvm, cv_res['ocsvm'] = result5
        Learner.save2file(clf_tree, os.path.join(classifier_dir, model_name + 'tree_sel.pkl'))
        Learner.save2file(clf_bayes, os.path.join(classifier_dir, model_name + 'bayes_sel.pkl'))
        Learner.save2file(clf_logistic, os.path.join(classifier_dir, model_name + 'logistic_sel.pkl'))
        Learner.save2file(clf_svm, os.path.join(classifier_dir, model_name + 'svm_sel.pkl'))
        Learner.save2file(clf_ocsvm, os.path.join(classifier_dir, model_name + 'ocsvm_sel.pkl'))
        json.dump(cv_res,
                  codecs.open(os.path.join(classifier_dir, model_name + 'cv_res_sel.json'), 'w', encoding='utf-8'))
        '''

        result1 = Pool().map(Learner.train_tree, [X,], [y], [True])
        result2 = Pool().map(Learner.train_bayes, [X], [y], [True])
        result3 = Pool().map(Learner.train_logistic, [X], [y], [True])
        result4 = Pool().map(Learner.train_SVM, [X], [y], [True])
        result5 = Pool().map(Learner.ocsvm, [X], [y], [True])
        '''
        """

    @staticmethod
    def cmp_algorithm_cv(base_dir, normal_dir, data_path, output_dir, model_name='', dataset=''):
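        # Build (or reload cached) feature matrices for the requested representation
        # (bag-of-words or TF-IDF, optionally char n-grams), apply chi2 feature
        # selection, then train and persist all classifiers via train_and_save.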
        char_wb = False
        tf = 'tf' in model_name
        if 'ngram' in model_name:
            ngram = (2, 15)
            # char_wb = True
        else:
            ngram = None

        classifier_dir = base_dir + dataset
        outfile = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
        if os.path.exists(outfile):
            return

        if os.path.exists(os.path.join(output_dir, model_name + "vec_sel.pkl")):
            X = Learner.obj_from_file(os.path.join(output_dir, model_name + "X_sel.pkl"))
            y = Learner.obj_from_file(os.path.join(output_dir, model_name + "y_sel.pkl"))
        else:
            instances, y = Learner.gen_instances(os.path.join(normal_dir, 'March'),
                                                 data_path, char_wb=char_wb, simulate=False)
            X, feature_names, vec = Learner.gen_X_matrix(instances, tf=tf, ngrams_range=ngram)

            Learner.save2file(X, os.path.join(output_dir, model_name + "X.pkl"))
            Learner.save2file(y, os.path.join(output_dir, model_name + "y.pkl"))
            Learner.save2file(vec, os.path.join(output_dir, model_name + "vec.pkl"))
            Learner.save2file(feature_names, os.path.join(output_dir, model_name + "feature_names.pkl"))
            X, feature_names, vec = Learner.feature_selection(X, y, 500, vec, instances, tf=tf, ngram_range=ngram)
            Learner.save2file(X, os.path.join(output_dir, model_name + "X_sel.pkl"))
            Learner.save2file(y, os.path.join(output_dir, model_name + "y_sel.pkl"))
            Learner.save2file(vec, os.path.join(output_dir, model_name + "vec_sel.pkl"))
            Learner.save2file(feature_names, os.path.join(output_dir, model_name + "feature_names_sel.pkl"))
        CtuCCAnalyzer.train_and_save(X, y, model_name, classifier_dir)

    @staticmethod
    def zero_day_helper(base_dir, src_name, model_name, algorithm, target_name, normal_dir=None):
        vec_dir = os.path.join(base_dir, src_name)
        model_path = os.path.join(vec_dir, model_name + algorithm + '_sel.pkl')
        target_path = os.path.join(base_dir, target_name)
        if normal_dir is None:
            data, labels = Learner.gen_instances('', target_path)
        else:
            data, labels = Learner.gen_instances(os.path.join(normal_dir, target_name), '')
        vec = Learner.obj_from_file(os.path.join(vec_dir, model_name + 'vec.pkl'))
        vec_sel = Learner.obj_from_file(os.path.join(vec_dir, model_name + 'vec_sel.pkl'))
        data, vocab, vec = Learner.gen_X_matrix(data, vec=vec)
        return Learner.predict(Learner.obj_from_file(model_path),
                               vec_sel, data, labels=labels,
                               src_name=src_name, model_name=model_name)

    @staticmethod
    def zero_day_sub(base_dir, normal_dir, model_name, output_dir):
        if os.path.exists(os.path.join(output_dir, model_name + 'pred_res.json')):
            return
        results = dict()
        for algorithm in ['tree', 'bayes', 'logistic', 'svm', 'ocsvm']:
            for src_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                for target_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                    res = CtuCCAnalyzer.zero_day_helper(base_dir, src_name, model_name, algorithm, target_name)
                    if algorithm not in results:
                        results[algorithm] = dict()
                    if src_name not in results[algorithm]:
                        results[algorithm][src_name] = dict()
                    results[algorithm][src_name][target_name] = res
                    # name = src_name + '_' + model_name + '_' + target_name
                    # CtuCCAnalyzer.logger.info(name + ':' + str(res))
                target_name = 'April'
                res = CtuCCAnalyzer.zero_day_helper(base_dir, src_name, model_name, algorithm, target_name,
                                                    normal_dir=normal_dir)
                # name = src_name + '_' + model_name + '_' + target_name
                # CtuCCAnalyzer.logger.info(name + ':' + str(res))
                results[algorithm][src_name][target_name] = res
        json.dump(results, codecs.open(os.path.join(output_dir, model_name + 'pred_res.json'), 'w', encoding='utf-8'))

        for algorithm in ['tree', 'bayes', 'logistic', 'svm', 'ocsvm']:
            for src_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                output = ''
                for target_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                    output = output + str(results[algorithm][src_name][target_name] * 100) + '\%' + ' & '
                CtuCCAnalyzer.logger.info(algorithm + ' & ' + src_name + ' & ' + output)

    @staticmethod
    def zero_day(base_dir, normal_dir):
        for model_name in ['bag']:  # ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
            CtuCCAnalyzer.zero_day_sub(base_dir, normal_dir, model_name + '_', base_dir)


class ViewClientHandler:
    logger = Utilities.set_logger('ViewClientHandler')

    @staticmethod
    def traverse(vc, root="ROOT", indent="", transform=None, stream=sys.stdout, bounds2id=None):
        '''
        Traverses the C{View} tree and prints its nodes.

        The nodes are printed converting them to string but other transformations can be specified
        by providing a method name as the C{transform} parameter.

        @type root: L{View}
        @param root: the root node from where the traverse starts
        @type indent: str
        @param indent: the indentation string to use to print the nodes
        @type transform: method
        @param transform: a method used to transform the node before it is printed
        '''

        if bounds2id is None:
            # avoid sharing a mutable default dict between calls
            bounds2id = {}

        if transform is None:
            # this cannot be a default value, otherwise
            # TypeError: 'staticmethod' object is not callable
            # is raised
            transform = ViewClient.TRAVERSE_CIT

        if type(root) == types.StringType and root == "ROOT":
            root = vc.root

        print vc.list()
        xml_root = ET.Element('hierarchy')
        ViewClientHandler.__traverse(root, indent, transform, stream, bounds2id=bounds2id)
        return bounds2id

        #         if not root:
        #             return
        #
        #         s = transform(root)
        #         if s:
        #             print >>stream, "%s%s" % (indent, s)
        #
        #         for ch in root.children:
        #             self.traverse(ch, indent=indent+"   ", transform=transform, stream=stream)

    @staticmethod
    def __traverse(root, indent="", transform=View.__str__, stream=sys.stdout, bounds2id={}):
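        # Recursively print each view and record its bounds string -> resource-id in
        # bounds2id; fill_ids() uses that mapping to patch uiautomator dumps.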
        if not root:
            return

        s = transform(root)
        sub_node = None
        if stream and s:
            ius = "%s%s" % (indent, s if isinstance(s, unicode) else unicode(s, 'utf-8', 'replace'))
            print >> stream, ius.encode('utf-8', 'replace')

            bounds = str(root.getBounds()).replace('((', '[')
            bounds = bounds.replace('))', ']')
            bounds = bounds.replace('), (', '][')
            bounds = bounds.replace(', ', ',')

            # print root.getPositionAndSize(), bounds
            bounds2id[bounds] = root.getId()

        for ch in root.children:
            ViewClientHandler.__traverse(ch, indent=indent + "   ", transform=transform,
                                         stream=stream, bounds2id=bounds2id)
        return sub_node

    @staticmethod
    def dump_view_server(package):
        kwargs1 = {VERBOSE: False, 'ignoresecuredevice': False, 'ignoreversioncheck': False}
        kwargs2 = {ViewClientOptions.FORCE_VIEW_SERVER_USE: False, ViewClientOptions.START_VIEW_SERVER: True,
                   ViewClientOptions.AUTO_DUMP: False, ViewClientOptions.IGNORE_UIAUTOMATOR_KILLED: True,
                   ViewClientOptions.COMPRESSED_DUMP: True,
                   ViewClientOptions.USE_UIAUTOMATOR_HELPER: False,
                   ViewClientOptions.DEBUG: {},
                   }
        kwargs2[ViewClientOptions.FORCE_VIEW_SERVER_USE] = True
        vc = ViewClient(*ViewClient.connectToDeviceOrExit(**kwargs1), **kwargs2)
        options = {WINDOW: -1, SAVE_SCREENSHOT: None, SAVE_VIEW_SCREENSHOTS: None, DO_NOT_DUMP_VIEWS: False,
                   DEVICE_ART: None, DROP_SHADOW: False, SCREEN_GLARE: False}
        windows = vc.list()
        print windows
        transform = MAP['b']
        for window in windows:
            if package not in windows[window]:
                continue
            print windows[window]
            vc.dump(window=int(window))
            # ViewClient.imageDirectory = options[SAVE_VIEW_SCREENSHOTS]
            return ViewClientHandler.traverse(vc, transform=transform)

    @staticmethod
    def fill_ids(xml_data, package):
        '''
        Fill in the resource-ids that uiautomator omits on low API levels (< 18).
        :param xml_data: the uiautomator XML dump
        :param package: the package name of the foreground app
        :return: the XML dump with resource-id attributes filled in where possible
        '''
        dom = parseString(xml_data.encode("utf-8"))
        nodes = dom.getElementsByTagName('node')
        for node in nodes:
            if node.hasAttribute('resource-id'):
                return xml_data
            else:
                break
        bounds2ids = ViewClientHandler.dump_view_server(package)
        if bounds2ids is None:
            ViewClientHandler.logger.error('Cannot identify the package!')
            return xml_data
        ViewClientHandler.logger.info(str(bounds2ids))
        for node in nodes:
            if node.getAttribute('bounds') in bounds2ids:
                node.setAttribute('resource-id', bounds2ids[node.getAttribute('bounds')])
            else:
                ViewClientHandler.logger.warn('Cannot find ' + node.getAttribute('bounds'))
        return dom.toxml()
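

# Typical use of fill_ids (illustrative; the dump variable and package name are placeholders):
#   fixed_xml = ViewClientHandler.fill_ids(uiautomator_xml_dump, 'com.example.app')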