Beispiel #1
0
def load_csbd_gp_familyfold(fold):
    """Load the CSBD data for one fold of the Google Play family-fold experiment.

    The function only assembles the model path, the corpus directory and a
    file-based vectorizer, and hands them to ``load_data_gp_familyfold``
    together with the ``".txt"`` extension and the constant ``5000``.

    Parameters
    ----------
    fold : integer, expected to be between 0 and 9
        The family fold to load; it selects the ``Fold<fold>`` model directory
        and is forwarded unchanged to ``load_data_gp_familyfold``.

    Returns
    -------
    data : dictionary
        Whatever ``load_data_gp_familyfold`` returns. Per the documentation
        that accompanied this loader (not verifiable from this function
        alone) the entries are:
        X : array, shape = [n_samples, n_features]
            Feature matrix of the samples
        y : array, shape = [n_samples]
            Labels (0 = benign, 1 = malware)
        training_indices : array
            Indices on which the classifier has been trained
        model : string
            Filepath containing the classification model
    """
    model_path = (
        "/data/Alex/malware-tools/csbd/GooglePlay/FamilyFold/Old/Fold{}/modelFold.pkl"
        .format(fold)
    )
    corpus_dir = '/data/Alex/malware-tools/csbd/data/'
    # Documents are given as file names; tokens come from MyTokenizer and are
    # recorded as binary (present/absent) float64 features.
    vectorizer_options = dict(
        input='filename',
        lowercase=False,
        token_pattern=None,
        tokenizer=MyTokenizer,
        binary=True,
        dtype=np.float64,
    )
    return load_data_gp_familyfold(
        model_path, TF(**vectorizer_options), corpus_dir, fold, ".txt", 5000)
def KmeansCluster(TrainMalSet, labelcsv, FeatureOption):
    """Cluster malware samples with k-means and score the result against family labels.

    Every ``.data`` file found via ``getApkList`` is vectorized (one token per
    line of the file), each sample is given the family id stored in the
    module-level ``family_dict`` (``-1`` when its hash is not listed), and the
    k-means assignment with ``family_count`` clusters is compared to those ids
    with the Fowlkes-Mallows score. Results are printed; nothing is returned.

    :param TrainMalSet: location(s) of the malware samples, passed to ``getApkList``
    :param labelcsv: label file passed to ``get_family_dict``; when its name
        contains ``"amd"`` the hash is taken as the part of the file name
        after the last underscore
    :param FeatureOption: forwarded as the vectorizer's ``binary`` argument
    """
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    sample_paths = getApkList(TrainMalSet, ".data")

    Logger.info("Loaded Samples")

    vectorizer = TF(input="filename", tokenizer=lambda text: text.split('\n'), token_pattern=None,
                    binary=FeatureOption)
    features = vectorizer.fit_transform(sample_paths)

    # presumably fills the module-level family_dict / family_count used below
    get_family_dict(labelcsv)

    true_families = []
    for sample_path in sample_paths:
        # drop the directory and the 5-character ".data" suffix
        stem = os.path.basename(sample_path)[:-5]
        sha256 = stem.split('_')[-1] if "amd" in labelcsv else stem
        true_families.append(family_dict.get(sha256, -1))

    # test
    print(true_families[:20])

    clustering = KMeans(n_clusters=family_count, random_state=10).fit(features)
    cluster_ids = clustering.labels_
    score = fowlkes_mallows_score(true_families, cluster_ids)

    print(cluster_ids[:20])

    print(family_count, score)
def vectorize_text(x_train, x_test, feature_count):
    """Vectorize train and test texts with a vectorizer fitted on the training texts.

    :param x_train: iterable of training documents; the vectorizer is fitted on it
    :param x_test: iterable of test documents; only transformed
    :param feature_count: passed as ``max_features`` to the vectorizer
    :return: tuple ``(text_train, text_test)`` of dense ``pandas.DataFrame`` objects
    """
    word_vectorizer = TF(strip_accents='unicode', analyzer='word', max_features=feature_count)

    # fit on the training texts only; the test texts reuse that vocabulary
    train_frame = pd.DataFrame(word_vectorizer.fit_transform(x_train).toarray())
    test_frame = pd.DataFrame(word_vectorizer.transform(x_test).toarray())

    return train_frame, test_frame
Beispiel #4
0
def Classification(MalwareCorpus, GoodwareCorpus, TestSize, FeatureOption, Model, NumTopFeats):
    '''
    Build feature vectors for a malware and a goodware corpus, split the samples
    into a training and a test set and vectorize the training part.

    In its current form the function only prints intermediate results (matrix
    shape, type and content of the training matrix); no classifier is trained
    and nothing is returned.
    Modified from Jiachun's code.

    :param String MalwareCorpus: absolute path of the malware corpus
    :param String GoodwareCorpus: absolute path of the goodware corpus
    :param TestSize: forwarded as ``test_size`` to ``train_test_split``
    :param FeatureOption: forwarded as the vectorizer's ``binary`` argument
    :param Model: not used by the code below
    :param NumTopFeats: not used by the code below
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data")
    AllMalSamples = CM.ListFiles(MalwareCorpus, ".data")
    AllGoodSamples = CM.ListFiles(GoodwareCorpus, ".data")
    AllSampleNames = AllMalSamples + AllGoodSamples
    Logger.info("Loaded samples")

    FeatureVectorizer = TF(input='filename', tokenizer=lambda x: x.split('\n'), token_pattern=None,
                           binary=FeatureOption)
    x = FeatureVectorizer.fit_transform(AllSampleNames)
    print(x.shape)

    # label malware as 1 and goodware as -1, in the same order as AllSampleNames.
    # These labels were commented out before, which left `y` undefined at the
    # train_test_split call below.
    Mal_labels = np.ones(len(AllMalSamples))
    Good_labels = np.empty(len(AllGoodSamples))
    Good_labels.fill(-1)
    y = np.concatenate((Mal_labels, Good_labels), axis=0)
    Logger.info("Label array - generated")

    # metric accumulators; nothing is appended to them in this function yet
    acc = []
    pre = []
    reca = []
    f1 = []
    auc = []
    for _ in range(1):
        # step 2: split all samples to training set and test set
        x_train_samplenames, x_test_samplenames, y_train, y_test = train_test_split(
            AllSampleNames, y, test_size=TestSize, random_state=0)
        print("x_train_samplenames--- ")
        # refit on the training files only so the vocabulary comes from them
        x_train = FeatureVectorizer.fit_transform(x_train_samplenames)
        print(type(x_train))
        print(x_train)
def SVMClassification(TrainMalSet, TrainGoodSet, TestMalSet, TestGoodSet, FeatureOption, Model, NumTopFeats):
    '''
    Train a linear SVM on the training corpora and plot its ROC curve on the test corpora.

    The value of C is selected by a 10-fold grid search scored with f1 on the
    training set. The fitted search object then produces decision scores for
    the test samples; the false/true positive rates and the area under the
    curve are printed and the ROC curve is shown with matplotlib.
    Nothing is returned.
    Modified from Jiachun's code.

    :param String/List TrainMalSet: absolute path/paths of the malware corpus for trainning set
    :param String/List TrainGoodSet: absolute path/paths of the goodware corpus for trainning set
    :param String/List TestMalSet: absolute path/paths of the malware corpus for test set
    :param String/List TestGoodSet: absolute path/paths of the goodware corpus for test set
    :param FeatureOption: forwarded as the vectorizer's ``binary`` argument
    :param Model: not used by this function
    :param NumTopFeats: not used by this function
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = CM.ListFiles(TrainMalSet, ".data")
    TrainGoodSamples = CM.ListFiles(TrainGoodSet, ".data")
    TestMalSamples = CM.ListFiles(TestMalSet, ".data")
    TestGoodSamples = CM.ListFiles(TestGoodSet, ".data")
    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename", tokenizer=lambda x: x.split('\n'), token_pattern=None,
                           binary=FeatureOption)
    # vocabulary is learned from the training files only
    x_train = FeatureVectorizer.fit_transform(TrainMalSamples + TrainGoodSamples)
    x_test = FeatureVectorizer.transform(TestMalSamples + TestGoodSamples)

    # label training sets malware as 1 and goodware as -1
    Train_Mal_labels = np.ones(len(TrainMalSamples))
    Train_Good_labels = np.empty(len(TrainGoodSamples))
    Train_Good_labels.fill(-1)
    y_train = np.concatenate((Train_Mal_labels, Train_Good_labels), axis=0)
    Logger.info("Training Label array - generated")

    # label testing sets malware as 1 and goodware as -1
    Test_Mal_labels = np.ones(len(TestMalSamples))
    Test_Good_labels = np.empty(len(TestGoodSamples))
    Test_Good_labels.fill(-1)
    y_test = np.concatenate((Test_Mal_labels, Test_Good_labels), axis=0)
    Logger.info("Testing Label array - generated")

    # step 2: train the model
    Logger.info("Perform Classification with SVM Model")
    Parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    Clf = GridSearchCV(LinearSVC(), Parameters, cv=10, scoring='f1', n_jobs=-1)
    SVMModels = Clf.fit(x_train, y_train)
    y_score = SVMModels.decision_function(x_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    print(fpr)
    print(tpr)
    print(roc_auc)

    # step 3: plot the ROC curve.
    # Only one figure is created; a preceding bare plt.figure() call used to
    # leave an additional empty figure behind.
    lw = 2
    plt.figure(figsize=(10, 10))
    # false positive rate on the x axis, true positive rate on the y axis
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
Beispiel #6
0
def HoldoutClassification(TrainMalSet, TrainGoodSet, TestMalSet, TestGoodSet,
                          FeatureOption, Model, NumTopFeats):
    '''
    Train a classifier for classifying malwares and goodwares using Support Vector Machine technique.
    Compute the prediction accuracy and f1 score of the classifier.
    Modified from Jiachun's code.

    :param String/List TrainMalSet: absolute path/paths of the malware corpus for trainning set
    :param String/List TrainGoodSet: absolute path/paths of the goodware corpus for trainning set
    :param String/List TestMalSet: absolute path/paths of the malware corpus for test set
    :param String/List TestGoodSet: absolute path/paths of the goodware corpus for test set
    :param String FeatureOption: tfidf or binary, specify how to construct the feature vector
    '''
    # step 1: creating feature vector
    Logger.debug(
        "Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = CM.ListFiles(TrainMalSet, ".data")
    TrainGoodSamples = CM.ListFiles(TrainGoodSet, ".data")
    TestMalSamples = CM.ListFiles(TestMalSet, ".data")
    TestGoodSamples = CM.ListFiles(TestGoodSet, ".data")
    AllTestSamples = TestMalSamples + TestGoodSamples
    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename",
                           tokenizer=lambda x: x.split('\n'),
                           token_pattern=None,
                           binary=FeatureOption)
    x_train = FeatureVectorizer.fit_transform(TrainMalSamples +
                                              TrainGoodSamples)
    x_test = FeatureVectorizer.transform(TestMalSamples + TestGoodSamples)

    # label training sets malware as 1 and goodware as -1
    Train_Mal_labels = np.ones(len(TrainMalSamples))
    Train_Good_labels = np.empty(len(TrainGoodSamples))
    Train_Good_labels.fill(-1)
    y_train = np.concatenate((Train_Mal_labels, Train_Good_labels), axis=0)
    Logger.info("Training Label array - generated")

    # label testing sets malware as 1 and goodware as -1
    Test_Mal_labels = np.ones(len(TestMalSamples))
    Test_Good_labels = np.empty(len(TestGoodSamples))
    Test_Good_labels.fill(-1)
    y_test = np.concatenate((Test_Mal_labels, Test_Good_labels), axis=0)
    Logger.info("Testing Label array - generated")

    # step 2: train the model
    Logger.info("Perform Classification with SVM Model")
    Parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    T0 = time.time()
    if not Model:
        Clf = GridSearchCV(LinearSVC(),
                           Parameters,
                           cv=5,
                           scoring='f1',
                           n_jobs=-1)
        SVMModels = Clf.fit(x_train, y_train)
        Logger.info(
            "Processing time to train and find best model with GridSearchCV is %s sec."
            % (round(time.time() - T0, 2)))
        BestModel = SVMModels.best_estimator_
        Logger.info("Best Model Selected : {}".format(BestModel))
        TrainingTime = round(time.time() - T0, 2)
        print "The training time for random split classification is %s sec." % (
            TrainingTime)
        print "Enter a filename to save the model:"
        filename = raw_input()
        dump(Clf, filename + ".pkl")
    else:
        SVMModels = load(Model)
        BestModel = SVMModels.best_estimator_
        TrainingTime = 0

    # step 4: Evaluate the best model on test set
    y_pred = SVMModels.predict(x_test)
    TestingTime = round(time.time() - TrainingTime - T0, 2)
    Accuracy = accuracy_score(y_test,
                              y_pred)  # Return (x1 == x2) element-wise.
    print "Test Set Accuracy = ", Accuracy
    print(
        metrics.classification_report(y_test,
                                      y_pred,
                                      labels=[1, -1],
                                      target_names=['Malware', 'Goodware']))
    Report = "Test Set Accuracy = " + str(
        Accuracy
    ) + "\n" + metrics.classification_report(
        y_test, y_pred, labels=[1, -1], target_names=['Malware', 'Goodware'])
    # pointwise multiplication between weight and feature vect
    w = BestModel.coef_
    w = w[0].tolist()
    v = x_test.toarray()
    vocab = FeatureVectorizer.get_feature_names()
    explanations = {os.path.basename(s): {} for s in AllTestSamples}
    for i in range(v.shape[0]):
        wx = v[i, :] * w
        wv_vocab = zip(wx, vocab)
        if y_pred[i] == 1:
            wv_vocab.sort(reverse=True)
            # print "pred: {}, org: {}".format(y_pred[i],y_test[i])
            # pprint(wv_vocab[:10])
            explanations[os.path.basename(
                AllTestSamples[i])]['top_features'] = wv_vocab[:NumTopFeats]
        elif y_pred[i] == -1:
            wv_vocab.sort()
            # print "pred: {}, org: {}".format(y_pred[i],y_test[i])
            # pprint(wv_vocab[-10:])
            explanations[os.path.basename(
                AllTestSamples[i])]['top_features'] = wv_vocab[-NumTopFeats:]
        explanations[os.path.basename(
            AllTestSamples[i])]['original_label'] = y_test[i]
        explanations[os.path.basename(
            AllTestSamples[i])]['predicted_label'] = y_pred[i]

    with open('explanations_HC.json', 'w') as FH:
        json.dump(explanations, FH, indent=4)

    return y_train, y_test, y_pred, TrainingTime, TestingTime
Beispiel #7
0
def select_optimal_num_features_load_drebin_gp_familyfold(fold):
    """Load the Drebin data of one Google Play family fold using the 'Alternate' linear SVM model.

    :param fold: fold identifier; it selects the ``Fold<fold>`` model directory
        and is forwarded to ``load_data_gp_familyfold``
    :return: the result of ``load_data_gp_familyfold`` (called with the
        ``".data"`` extension and ``-1`` as last argument)
    """
    model_path = "../new_models/Drebin/Alternate/GooglePlay/Fold{}/modelFoldLinearSVM.pkl".format(fold)
    corpus_dir = '/data/Alex/malware-tools/Drebin/data/'
    # one token per line of each sample file, binary feature values
    line_vectorizer = TF(input='filename',
                         tokenizer=lambda text: text.split('\n'),
                         token_pattern=None,
                         binary=True)
    data = load_data_gp_familyfold(model_path, line_vectorizer, corpus_dir, fold, ".data", -1)
    return data
Beispiel #8
0
def load_drebin_vt_familyfold(fold):
    """Load the Drebin data of one VT family fold.

    :param fold: fold number; the model is read from directory
        ``Fold<fold + 1>`` while ``fold`` itself is forwarded to
        ``load_data_vt_familyfold``
    :return: the result of ``load_data_vt_familyfold`` (called with the
        ``".data"`` extension and ``-1`` as last argument)
    """
    model_path = "/data/Alex/malware-tools/Drebin/VTFamilyFold/Fold{}/modelFoldLinearSVM.pkl".format(fold + 1)
    corpus_dir = '/data/Alex/malware-tools/Drebin/data/'
    # one token per line of each sample file, binary feature values
    line_vectorizer = TF(input='filename',
                         tokenizer=lambda text: text.split('\n'),
                         token_pattern=None,
                         binary=True)
    data = load_data_vt_familyfold(model_path, line_vectorizer, corpus_dir, fold, ".data", -1)
    return data
Beispiel #9
0
def load_drebin_gp_test_on_vt():
    """Load the Drebin data for the 'GPTestAllVT' model.

    :return: the result of ``load_data_gp_test_on_vt`` (called with the
        ``".data"`` extension and ``-1`` as last argument)
    """
    model_path = "/data/Alex/malware-tools/Drebin/GPTestAllVT/modelLinear.pkl"
    corpus_dir = '/data/Alex/malware-tools/Drebin/data/'
    # one token per line of each sample file, binary feature values
    line_vectorizer = TF(input='filename',
                         tokenizer=lambda text: text.split('\n'),
                         token_pattern=None,
                         binary=True)
    data = load_data_gp_test_on_vt(model_path, line_vectorizer, corpus_dir, ".data", -1)
    return data
Beispiel #10
0
def load_csbd_gp_test_on_vt():
    """Load the CSBD data for the 'GPTestOnVTAll' model.

    :return: the result of ``load_data_gp_test_on_vt`` (called with the
        ``".txt"`` extension and ``5000`` as last argument)
    """
    model_path = "/data/Alex/malware-tools/csbd/GPTestOnVTAll/model.pkl"
    corpus_dir = '/data/Alex/malware-tools/csbd/data/'
    # file-based vectorizer: MyTokenizer tokens, case preserved, binary float64 values
    vectorizer_options = dict(
        input='filename',
        lowercase=False,
        token_pattern=None,
        tokenizer=MyTokenizer,
        binary=True,
        dtype=np.float64,
    )
    data = load_data_gp_test_on_vt(model_path, TF(**vectorizer_options), corpus_dir, ".txt", 5000)
    return data
Beispiel #11
0
target_name = tr_data.target_names
from sklearn.svm import SVC
# def feature_work(data=None,vb=None,stop_words=None,max_df=1):
#     cv=CV(stop_words=stop_words,max_df=max_df,vocabulary=vb)
#     #print(cv.vocabulary)
#     tr_vb=cv.vocabulary_
#
#     tf=TF()
#     tf_idf=tf.fit_transform(cv.fit_transform(data))  # term counts and tf-idf values
#     print('0:',cv.fit_transform(data).shape)
#     print('1:', tf_idf.shape)
#     #word=cv.get_feature_names()  # the vocabulary terms
#     #weight=tf_idf.toarray()
#     return tr_vb,tf_idf
cv = CV(stop_words='english', max_df=0.8)
tf = TF()
# Vocabulary and IDF weights are learned from the training texts only.
tr_counts = cv.fit_transform(tr_data_x)
tr_idf = tf.fit_transform(tr_counts)  # term counts -> tf-idf values
print('0:', tr_counts.shape)
# The test texts are only transformed, so that they share the training
# vocabulary and weights; refitting on them would produce a feature space
# that does not match tr_idf.
te_counts = cv.transform(te_data_x)
te_idf = tf.transform(te_counts)  # term counts -> tf-idf values
print('1:', te_counts.shape)
# train feature: tf_tr is the training input derived from tr_data_x; the training target tr_data_y is unchanged
#tr_vb,tf_tr=feature_work(tr_data_x,stop_words='english',max_df=0.5)
#test feature
#te_vb,tf_te=feature_work(te_data_x,vb=tr_vb)


def getaccuracy(model=None, x=None, y_test=None, tar_name=None):
    """Print a classification report and the accuracy of ``model`` on ``x``.

    :param model: fitted estimator exposing ``predict``
    :param x: feature matrix handed to ``model.predict``
    :param y_test: true labels the predictions are compared with
    :param tar_name: passed as ``target_names`` to ``classification_report``
    """
    predictions = model.predict(x)
    report = classification_report(y_test, predictions, target_names=tar_name)
    print(report)
    accuracy = accuracy_score(y_test, predictions)
    print(accuracy)
Beispiel #12
0
    # Load Data (the file is closed as soon as it has been parsed)
    DATA_PATH = "data.csv"
    with open(DATA_PATH) as data_file:
        df = pd.read_csv(data_file, encoding="utf-8").dropna()

    # Clean
    df["Title"] = df["Title"].apply(lambda x: clean(x))
    df["Text"] = df["Text"].apply(lambda x: clean(x))

    # UNDERSAMPLE
    train_df = sampler(df)

    # Feature Extractor: fitted on title and text of every row joined by a space
    cv = TF(
        sublinear_tf=True,
        min_df=10,
        encoding="latin-1",
        ngram_range=(1, 1),
        stop_words="english",
        analyzer="word",
        max_features=6150,
    )
    cv.fit(df["Title"] + " " + df["Text"])
    with open(VEC_PATH, "wb") as vec_file:
        pickle.dump(cv, vec_file)

    # Build Model: title features and text features are stacked side by side
    model = MNB()
    train_data = hstack(
        (cv.transform(train_df["Title"]), cv.transform(train_df["Text"])))
    model.fit(train_data, train_df["Flair"])
    with open(MODEL_PATH, "wb") as model_file:
        pickle.dump(model, model_file)
Beispiel #13
0
    return cleaned_text


# Read the review data sets.
train_df = pd.read_json('./data/train.json', encoding='utf-8')
print(train_df['review_text'])
test_df = pd.read_json('./data/test.json', encoding='utf-8')

# Cleaned review text goes into a new 'text' column of both frames.
train_df['text'] = train_df['review_text'].apply(clean_text)
test_df['text'] = test_df['review_text'].apply(clean_text)
print(train_df['text'].head())
# Turn the spoiler flag into 1 (truthy) / 0 (falsy).
train_df['is_spoiler'] = train_df['is_spoiler'].apply(lambda flag: int(bool(flag)))

# Shuffle the training rows and renumber the index.
train_df = train_df.sample(frac=1).reset_index(drop=True)
tfidf = TF(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=200,
)

print("Creating the tfidf vector...\n")
# The vectorizer is fitted on the training text only.
tfidf.fit(train_df['text'])
x_train = tfidf.transform(train_df['text']).toarray()
x_test = tfidf.transform(test_df['text']).toarray()

print(x_train.shape)
print(x_test.shape)

y_train = train_df['is_spoiler']
Beispiel #14
0
def RandomClassification(MalwareCorpus, GoodwareCorpus, TestSize, NumFeaturesToBeSelected, FeatureOption):
    '''
    Train a classifier for classifying malwares and goodwares using Random Forest technique
    Compute the prediction accuracy and f1 score of the classifier

    :param String MalwareCorpus: absolute path of the malware corpus
    :param String GoodwareCorpus: absolute path of the goodware corpus
    :param Float TestSize: test set split (default is 0.3 for testing and 0.7 for training)
    :param integer NumFeaturesToBeSelected: number of top features to select
    :param Boolean FeatureOption: False
    '''

    # Step 1: Getting the malware and goodware txt files
    Logger.debug ("Loading positive and negative samples")
    AllMalSamples = glob.glob(os.path.join(MalwareCorpus,'*txt'))
    AllGoodSamples = glob.glob(os.path.join(GoodwareCorpus,'*txt'))
    Logger.info ("All Samples loaded")

    # Step 2: Creating feature vector
    FeatureVectorizer = TF(input='filename', lowercase=False, token_pattern=None,
                           tokenizer=MyTokenizer, binary=FeatureOption, dtype=np.float64)
    X = FeatureVectorizer.fit_transform(AllMalSamples + AllGoodSamples)

    # Label malware as 1 and goodware as -1
    MalLabels = np.ones(len(AllMalSamples))
    GoodLabels = np.empty(len(AllGoodSamples))
    GoodLabels.fill(-1)
    Y = np.concatenate((MalLabels, GoodLabels), axis=0)
    Logger.info("Label array - generated")

    # Step 3: Split all samples into training and test set
    XTrain, XTest, YTrain, YTest = train_test_split(X, Y,
                                                        test_size=TestSize, random_state=randint(0,100))
    Logger.debug ("Test set split = %s", TestSize)

    Features = FeatureVectorizer.get_feature_names()
    Logger.info ("Total number of features: {} ".format(len(Features)))

    if len(Features) > NumFeaturesToBeSelected:
        #with feature selection
        Logger.info ("Gonna select %s features", NumFeaturesToBeSelected)
        FSAlgo = SelectKBest(chi2, k = NumFeaturesToBeSelected)

        XTrain = FSAlgo.fit_transform(XTrain, YTrain)
        XTest = FSAlgo.transform(XTest)
        

    Logger.info ("Gonna perform classification with C-RandomForest")


    # Step 4: model selection through cross validation
    # Assuming RandomForest is the only classifier we are gonna try, we will set the n_estimators parameter as follows.
    Parameters = {'n_estimators': [10,50,100,200,500,1000],
                  'bootstrap': [True, False],
                  'criterion': ['gini', 'entropy']}
    Clf = GridSearchCV(RandomForestClassifier(), Parameters,  cv=5, scoring = 'f1', n_jobs=-1)
    RFmodels = Clf.fit(XTrain, YTrain)
    BestModel = RFmodels.best_estimator_
    Logger.info('CV done - Best model selected: {}'.format(BestModel))
    # Best model is chosen through 5-fold cross validation and stored in the variable: RFmodels

    # Step 5: Evaluate the best model on test set
    YPred = RFmodels.predict(XTest)
    Accuracy = accuracy_score(YTest, YPred)
    print "Test Set Accuracy = ", Accuracy
    print(metrics.classification_report(YTest, YPred,  labels=[1, -1], target_names=['Malware', 'Goodware']))
Beispiel #15
0
def load_csbd_vt_familyfold(fold):
    """Load the CSBD data of one VT family fold.

    :param fold: fold number; the model is read from directory
        ``Fold<fold + 1>`` while ``fold`` itself is forwarded to
        ``load_data_vt_familyfold``
    :return: the result of ``load_data_vt_familyfold`` (called with the
        ``".txt"`` extension and ``5000`` as last argument)
    """
    model_path = "/data/Alex/malware-tools/csbd/VTFamilyFold/Fold{}/model.pkl".format(fold + 1)
    corpus_dir = '/data/Alex/malware-tools/csbd/data/'
    # file-based vectorizer: MyTokenizer tokens, case preserved, binary float64 values
    vectorizer_options = dict(
        input='filename',
        lowercase=False,
        token_pattern=None,
        tokenizer=MyTokenizer,
        binary=True,
        dtype=np.float64,
    )
    data = load_data_vt_familyfold(model_path, TF(**vectorizer_options), corpus_dir, fold, ".txt", 5000)
    return data
def RandomClassification(MalwareCorpus, GoodwareCorpus, TestSize,
                         FeatureOption, Model, NumTopFeats):
    '''
    Train a classifier for classifying malwares and goodwares using Support Vector Machine technique.
    Compute the prediction accuracy and f1 score of the classifier.
    Modified from Jiachun's code.

    :param String MalwareCorpus: absolute path of the malware corpus
    :param String GoodwareCorpus: absolute path of the goodware corpus
    :param String FeatureOption: tfidf or binary, specify how to construct the feature vector

    :rtype String Report: result report
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data")
    AllMalSamples = CM.ListFiles(MalwareCorpus, ".data")
    AllGoodSamples = CM.ListFiles(GoodwareCorpus, ".data")
    AllSampleNames = AllMalSamples + AllGoodSamples
    Logger.info("Loaded samples")

    FeatureVectorizer = TF(input='filename',
                           tokenizer=lambda x: x.split('\n'),
                           token_pattern=None,
                           binary=FeatureOption)
    x = FeatureVectorizer.fit_transform(AllMalSamples + AllGoodSamples)

    # label malware as 1 and goodware as -1
    Mal_labels = np.ones(len(AllMalSamples))
    Good_labels = np.empty(len(AllGoodSamples))
    Good_labels.fill(-1)
    y = np.concatenate((Mal_labels, Good_labels), axis=0)
    Logger.info("Label array - generated")

    # step 2: split all samples to training set and test set
    x_train_samplenames, x_test_samplenames, y_train, y_test = train_test_split(
        AllSampleNames,
        y,
        test_size=TestSize,
        random_state=random.randint(0, 100))
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TestSize,
    #                                             random_state=random.randint(0, 100))
    x_train = FeatureVectorizer.fit_transform(x_train_samplenames)
    x_test = FeatureVectorizer.transform(x_test_samplenames)
    Logger.debug("Test set split = %s", TestSize)
    Logger.info("train-test split done")

    # step 3: train the model
    Logger.info("Perform Classification with SVM Model")
    Parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    T0 = time.time()
    if not Model:
        Clf = GridSearchCV(LinearSVC(),
                           Parameters,
                           cv=5,
                           scoring='f1',
                           n_jobs=-1)
        SVMModels = Clf.fit(x_train, y_train)
        Logger.info(
            "Processing time to train and find best model with GridSearchCV is %s sec."
            % (round(time.time() - T0, 2)))
        BestModel = SVMModels.best_estimator_
        Logger.info("Best Model Selected : {}".format(BestModel))
        print "The training time for random split classification is %s sec." % (
            round(time.time() - T0, 2))
        print "Enter a filename to save the model:"
        filename = raw_input()
        dump(Clf, filename + ".pkl")
    else:
        SVMModels = load(Model)
        BestModel = SVMModels.best_estimator

    # step 4: Evaluate the best model on test set
    T0 = time.time()
    y_pred = SVMModels.predict(x_test)
    print "The testing time for random split classification is %s sec." % (
        round(time.time() - T0, 2))
    Accuracy = accuracy_score(y_test, y_pred)
    print "Test Set Accuracy = {}".format(Accuracy)
    print(
        metrics.classification_report(y_test,
                                      y_pred,
                                      labels=[1, -1],
                                      target_names=['Malware', 'Goodware']))
    Report = "Test Set Accuracy = " + str(
        Accuracy
    ) + "\n" + metrics.classification_report(
        y_test, y_pred, labels=[1, -1], target_names=['Malware', 'Goodware'])
    # pointwise multiplication between weight and feature vect
    w = BestModel.coef_
    w = w[0].tolist()
    v = x_test.toarray()
    vocab = FeatureVectorizer.get_feature_names()
    explanations = {os.path.basename(s): {} for s in x_test_samplenames}
    for i in range(v.shape[0]):
        wx = v[i, :] * w
        wv_vocab = zip(wx, vocab)
        if y_pred[i] == 1:
            wv_vocab.sort(reverse=True)
            #print "pred: {}, org: {}".format(y_pred[i],y_test[i])
            #pprint(wv_vocab[:10])
            explanations[os.path.basename(
                x_test_samplenames[i]
            )]['top_features'] = wv_vocab[:NumTopFeats]
        elif y_pred[i] == -1:
            wv_vocab.sort()
            #print "pred: {}, org: {}".format(y_pred[i],y_test[i])
            #pprint(wv_vocab[-10:])
            explanations[os.path.basename(
                x_test_samplenames[i]
            )]['top_features'] = wv_vocab[-NumTopFeats:]
        explanations[os.path.basename(
            x_test_samplenames[i])]['original_label'] = y_test[i]
        explanations[os.path.basename(
            x_test_samplenames[i])]['predicted_label'] = y_pred[i]

    with open('explanations_RC.json', 'w') as FH:
        json.dump(explanations, FH, indent=4)

    # return TestLabels, PredictedLabels
    return Report
Beispiel #17
0
def KmeansCluster(TrainMalSet, labelcsv, family10csv, FeatureOption):
    """Cluster malware samples with k-means and report how the clusters match the families.

    Steps:
      1. vectorize every ``.data`` sample (one token per line of the file);
      2. give each sample the family id found in the module-level
         ``family_dict`` (``-1`` when its hash is not listed);
      3. run k-means with ``family_count`` clusters and print the
         Fowlkes-Mallows score against the family ids;
      4. map every cluster to the family with the highest Jaccard similarity;
      5. per cluster, print the family that most of the missing samples went
         to ("out") and the family most of the foreign samples came from ("in").

    Everything is printed; nothing is returned.

    :param TrainMalSet: location(s) of the malware samples, passed to ``getApkList``
    :param labelcsv: label file passed to ``get_family_dict``; when its name
        contains ``"amd"`` the hash is the part of the file name after the
        last underscore
    :param family10csv: second argument of ``get_family_dict``
    :param FeatureOption: forwarded as the vectorizer's ``binary`` argument
    """
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = getApkList(TrainMalSet, ".data")

    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename", tokenizer=lambda x: x.split('\n'), token_pattern=None,
                           binary=FeatureOption)
    X = FeatureVectorizer.fit_transform(TrainMalSamples)

    # presumably fills the module-level family_dict / family_count /
    # familyselect used below - verify against get_family_dict
    get_family_dict(labelcsv, family10csv)

    y_train = []
    for sample_path in TrainMalSamples:
        # drop the directory and the 5-character ".data" suffix
        stem = os.path.split(sample_path)[-1][:-5]
        if "amd" in labelcsv:
            sha256 = stem.split('_')[-1]
        else:
            sha256 = stem
        y_train.append(family_dict.get(sha256, -1))

    kmeans = KMeans(n_clusters=family_count, random_state=10).fit(X)
    labels = kmeans.labels_
    score = fowlkes_mallows_score(y_train, labels)

    print(family_count, score)

    # s_train[f]: indices of the samples whose known family is f
    s_train = {}
    for i in range(family_count):
        s_train[i] = [j for j, fam in enumerate(y_train) if str(fam) == str(i)]

    print(s_train)

    # s_clus[c]: indices of the samples k-means put into cluster c
    s_clus = {}
    for i in range(family_count):
        s_clus[i] = [j for j, clus in enumerate(labels) if str(clus) == str(i)]

    print(s_clus)

    # label_list[c] is the family assigned to cluster c. It is sized by
    # family_count (a fixed length of 10 overflowed with more families).
    label_list = [0] * family_count

    # jac_dict[c][f]: Jaccard similarity between cluster c and family f;
    # the first family reaching the maximum is taken as the cluster's family
    jac_dict = {}
    for i in range(family_count):
        max_jac = -1
        jac_dict[i] = [0] * family_count
        for j in range(family_count):
            jac_v = jaccard_sim(s_clus[i], s_train[j])
            jac_dict[i][j] = jac_v
            if jac_v > max_jac:
                max_jac = jac_v
                label_list[i] = j

    print("label_list is ", label_list)

    clus_other_fam = {}
    clus_not_in = {}

    for clus_num in range(family_count):
        fam_num = label_list[clus_num]
        # sets give constant-time membership tests for the sample indices
        in_cluster = set(s_clus[clus_num])
        in_family = set(s_train[fam_num])
        # samples of this family that ended up in another cluster: record the
        # family assigned to the cluster they went to
        clus_other_fam[clus_num] = [label_list[labels[item]]
                                    for item in s_train[fam_num]
                                    if item not in in_cluster]
        # samples in this cluster that belong to another family: record their
        # own family id (may be -1 for samples without a known family)
        clus_not_in[clus_num] = [y_train[item]
                                 for item in s_clus[clus_num]
                                 if item not in in_family]

    print("jac matrix: ")
    print("classnum, jac_matrix, most_possi_fam, out_fam, out_fam_num, in_fam, in_fam_num, should in, "
          "actual in, rate1, rate2")
    for clus_num in range(family_count):
        fam_num = label_list[clus_num]
        family_name_str = familyselect[fam_num]

        # fam_err_chuqu[f]: family members that went out to family f's cluster
        # fam_err_jinlai[f]: members of family f that came into this cluster
        fam_err_chuqu = [0] * family_count
        fam_err_jinlai = [0] * family_count

        for item in clus_other_fam[clus_num]:
            fam_err_chuqu[item] += 1

        for item in clus_not_in[clus_num]:
            if item == -1:
                # unknown family: index -1 would be counted for the last family
                continue
            fam_err_jinlai[item] += 1

        # NOTE: the rates named at the end of the header line are not part of
        # the printed row; the two tally lists are printed in their place.
        chuqu_err_max = max(fam_err_chuqu)
        jinlai_err_max = max(fam_err_jinlai)

        chuqu_fam = fam_err_chuqu.index(chuqu_err_max)
        jilai_fam = fam_err_jinlai.index(jinlai_err_max)

        # no name is shown when there was no error of that kind
        out_fam_name = familyselect[chuqu_fam] if chuqu_err_max != 0 else ""
        in_fam_name = familyselect[jilai_fam] if jinlai_err_max != 0 else ""

        print(clus_num, jac_dict[clus_num], family_name_str, out_fam_name, chuqu_err_max, in_fam_name
              , jinlai_err_max, len(s_train[fam_num]), len(s_clus[clus_num]), fam_err_chuqu, fam_err_jinlai)