def load_csbd_gp_familyfold(fold):
    """Load and return the CSBD dataset for the Google Play family-fold experiment.

    Parameters
    ----------
    fold : integer, between 0 and 9
        The family fold to load.

    Returns
    -------
    data : dictionary
        Dictionary, the attributes are:
        X : array, shape = [n_samples, n_features]
            An {n_samples by n_features} matrix containing the data
        y : array, shape = [n_samples]
            Labels (0 = benign, 1 = malware)
        training_indices : array
            Indices on which the classifier has been trained
        model : string
            Filepath of the pickled classification model
    """
    model = "/data/Alex/malware-tools/csbd/GooglePlay/FamilyFold/Old/Fold" + str(fold) + "/modelFold.pkl"
    Corpus = '/data/Alex/malware-tools/csbd/data/'
    FeatureVectorizer = TF(input='filename', lowercase=False, token_pattern=None,
                           tokenizer=MyTokenizer, binary=True, dtype=np.float64)
    data = load_data_gp_familyfold(model, FeatureVectorizer, Corpus, fold, ".txt", 5000)
    return data
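# Hedged usage sketch (not part of the original code): assumes the hard-coded CSBD paths
# above exist and that load_data_gp_familyfold returns the dictionary described in the
# docstring; the key names used here simply mirror that docstring.
data = load_csbd_gp_familyfold(0)      # load family fold 0
print(data["X"].shape)                 # (n_samples, n_features)
print(data["y"][:10])                  # first ten labels (0 = benign, 1 = malware)
print(data["model"])                   # path to the pickled classifier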
def KmeansCluster(TrainMalSet, labelcsv, FeatureOption):
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = getApkList(TrainMalSet, ".data")
    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename", tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=FeatureOption)
    X = FeatureVectorizer.fit_transform(TrainMalSamples)

    # Map each sample's sha256 to its family label; samples without a label get -1.
    get_family_dict(labelcsv)
    y_train = []
    for file in TrainMalSamples:
        if "amd" in labelcsv:
            sha256 = os.path.split(file)[-1][:-5].split('_')[-1]
        else:
            sha256 = os.path.split(file)[-1][:-5]
        if sha256 in family_dict:
            y_train.append(family_dict[sha256])
        else:
            y_train.append(-1)

    # test
    print(y_train[:20])

    kmeans = KMeans(n_clusters=family_count, random_state=10).fit(X)
    labels = kmeans.labels_
    score = fowlkes_mallows_score(y_train, labels)
    print(labels[:20])
    print(family_count, score)
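# Self-contained toy check (illustration only, values are assumptions) of the clustering
# metric used above: fowlkes_mallows_score compares two labelings and is invariant to a
# permutation of the cluster ids, so a relabelled-but-consistent clustering still scores 1.0.
from sklearn.metrics import fowlkes_mallows_score
print(fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1]))   # 1.0
print(fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0]))   # 1.0, cluster ids permuted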
def vectorize_text(x_train, x_test, feature_count):
    vectorizer = TF(strip_accents='unicode', analyzer='word', max_features=feature_count)
    text_train = vectorizer.fit_transform(x_train).toarray()
    text_test = vectorizer.transform(x_test).toarray()
    text_train, text_test = pd.DataFrame(text_train), pd.DataFrame(text_test)
    return text_train, text_test
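# Minimal usage sketch for vectorize_text; the example documents below are assumptions,
# not data from the original project.
example_train = ["the quick brown fox", "jumped over the lazy dog"]
example_test = ["the lazy fox"]
tr_feats, te_feats = vectorize_text(example_train, example_test, feature_count=10)
print(tr_feats.shape, te_feats.shape)   # both DataFrames share the same column space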
def Classification(MalwareCorpus, GoodwareCorpus, TestSize, FeatureOption, Model, NumTopFeats):
    '''
    Train a classifier for classifying malware and goodware samples using the Support Vector Machine technique.
    Compute the prediction accuracy and f1 score of the classifier.
    Modified from Jiachun's code.

    :param String MalwareCorpus: absolute path of the malware corpus
    :param String GoodwareCorpus: absolute path of the goodware corpus
    :param String FeatureOption: tfidf or binary, specify how to construct the feature vector
    :rtype String Report: result report
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data")
    AllMalSamples = CM.ListFiles(MalwareCorpus, ".data")
    # print(len(AllMalSamples))
    AllGoodSamples = CM.ListFiles(GoodwareCorpus, ".data")
    AllSampleNames = AllMalSamples + AllGoodSamples
    Logger.info("Loaded samples")

    FeatureVectorizer = TF(input='filename', tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=FeatureOption)
    x = FeatureVectorizer.fit_transform(AllMalSamples + AllGoodSamples)
    print(x.shape)

    # label malware as 1 and goodware as -1 (needed below by train_test_split)
    Mal_labels = np.ones(len(AllMalSamples))
    Good_labels = np.empty(len(AllGoodSamples))
    Good_labels.fill(-1)
    y = np.concatenate((Mal_labels, Good_labels), axis=0)
    Logger.info("Label array - generated")

    acc = []
    pre = []
    reca = []
    f1 = []
    auc = []
    for i in range(1):
        # step 2: split all samples to training set and test set
        x_train_samplenames, x_test_samplenames, y_train, y_test = train_test_split(
            AllSampleNames, y, test_size=TestSize, random_state=0)
        print("x_train_samplenames--- ")
        x_train = FeatureVectorizer.fit_transform(x_train_samplenames)
        print(type(x_train))
        print(x_train)
def SVMClassification(TrainMalSet, TrainGoodSet, TestMalSet, TestGoodSet, FeatureOption, Model, NumTopFeats):
    '''
    Train a classifier for classifying malware and goodware samples using the Support Vector Machine technique.
    Compute the prediction accuracy and f1 score of the classifier.
    Modified from Jiachun's code.

    :param String/List TrainMalSet: absolute path/paths of the malware corpus for the training set
    :param String/List TrainGoodSet: absolute path/paths of the goodware corpus for the training set
    :param String/List TestMalSet: absolute path/paths of the malware corpus for the test set
    :param String/List TestGoodSet: absolute path/paths of the goodware corpus for the test set
    :param String FeatureOption: tfidf or binary, specify how to construct the feature vector
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = CM.ListFiles(TrainMalSet, ".data")
    TrainGoodSamples = CM.ListFiles(TrainGoodSet, ".data")
    TestMalSamples = CM.ListFiles(TestMalSet, ".data")
    TestGoodSamples = CM.ListFiles(TestGoodSet, ".data")
    AllTestSamples = TestMalSamples + TestGoodSamples
    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename", tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=FeatureOption)
    x_train = FeatureVectorizer.fit_transform(TrainMalSamples + TrainGoodSamples)
    x_test = FeatureVectorizer.transform(TestMalSamples + TestGoodSamples)

    # label training sets malware as 1 and goodware as -1
    Train_Mal_labels = np.ones(len(TrainMalSamples))
    Train_Good_labels = np.empty(len(TrainGoodSamples))
    Train_Good_labels.fill(-1)
    y_train = np.concatenate((Train_Mal_labels, Train_Good_labels), axis=0)
    Logger.info("Training Label array - generated")

    # label testing sets malware as 1 and goodware as -1
    Test_Mal_labels = np.ones(len(TestMalSamples))
    Test_Good_labels = np.empty(len(TestGoodSamples))
    Test_Good_labels.fill(-1)
    y_test = np.concatenate((Test_Mal_labels, Test_Good_labels), axis=0)
    Logger.info("Testing Label array - generated")

    # step 2: train the model
    Logger.info("Perform Classification with SVM Model")
    Parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    Clf = GridSearchCV(LinearSVC(), Parameters, cv=10, scoring='f1', n_jobs=-1)
    SVMModels = Clf.fit(x_train, y_train)

    y_score = SVMModels.decision_function(x_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    print(fpr)
    print(tpr)
    print(roc_auc)

    plt.figure()
    lw = 2
    plt.figure(figsize=(10, 10))
    # plot the ROC curve: false positive rate on the x-axis, true positive rate on the y-axis
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
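# Self-contained toy sketch (labels and scores are assumptions, illustration only) of the
# roc_curve/auc step used above, so the fpr/tpr/AUC computation can be checked in isolation.
from sklearn.metrics import roc_curve, auc
_toy_y = [1, 1, -1, -1]                 # two malware, two goodware samples
_toy_scores = [0.9, 0.4, 0.35, 0.1]     # decision-function values
_fpr, _tpr, _ = roc_curve(_toy_y, _toy_scores)
print(auc(_fpr, _tpr))                  # 1.0 for this perfectly separable toy example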
def HoldoutClassification(TrainMalSet, TrainGoodSet, TestMalSet, TestGoodSet, FeatureOption, Model, NumTopFeats):
    '''
    Train a classifier for classifying malware and goodware samples using the Support Vector Machine technique.
    Compute the prediction accuracy and f1 score of the classifier.
    Modified from Jiachun's code.

    :param String/List TrainMalSet: absolute path/paths of the malware corpus for the training set
    :param String/List TrainGoodSet: absolute path/paths of the goodware corpus for the training set
    :param String/List TestMalSet: absolute path/paths of the malware corpus for the test set
    :param String/List TestGoodSet: absolute path/paths of the goodware corpus for the test set
    :param String FeatureOption: tfidf or binary, specify how to construct the feature vector
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = CM.ListFiles(TrainMalSet, ".data")
    TrainGoodSamples = CM.ListFiles(TrainGoodSet, ".data")
    TestMalSamples = CM.ListFiles(TestMalSet, ".data")
    TestGoodSamples = CM.ListFiles(TestGoodSet, ".data")
    AllTestSamples = TestMalSamples + TestGoodSamples
    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename", tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=FeatureOption)
    x_train = FeatureVectorizer.fit_transform(TrainMalSamples + TrainGoodSamples)
    x_test = FeatureVectorizer.transform(TestMalSamples + TestGoodSamples)

    # label training sets malware as 1 and goodware as -1
    Train_Mal_labels = np.ones(len(TrainMalSamples))
    Train_Good_labels = np.empty(len(TrainGoodSamples))
    Train_Good_labels.fill(-1)
    y_train = np.concatenate((Train_Mal_labels, Train_Good_labels), axis=0)
    Logger.info("Training Label array - generated")

    # label testing sets malware as 1 and goodware as -1
    Test_Mal_labels = np.ones(len(TestMalSamples))
    Test_Good_labels = np.empty(len(TestGoodSamples))
    Test_Good_labels.fill(-1)
    y_test = np.concatenate((Test_Mal_labels, Test_Good_labels), axis=0)
    Logger.info("Testing Label array - generated")

    # step 2: train the model
    Logger.info("Perform Classification with SVM Model")
    Parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    T0 = time.time()
    if not Model:
        Clf = GridSearchCV(LinearSVC(), Parameters, cv=5, scoring='f1', n_jobs=-1)
        SVMModels = Clf.fit(x_train, y_train)
        Logger.info("Processing time to train and find best model with GridSearchCV is %s sec."
                    % (round(time.time() - T0, 2)))
        BestModel = SVMModels.best_estimator_
        Logger.info("Best Model Selected : {}".format(BestModel))
        TrainingTime = round(time.time() - T0, 2)
        print("The training time for random split classification is %s sec." % (TrainingTime))
        print("Enter a filename to save the model:")
        filename = input()
        dump(Clf, filename + ".pkl")
    else:
        SVMModels = load(Model)
        BestModel = SVMModels.best_estimator_
        TrainingTime = 0

    # step 4: Evaluate the best model on the test set
    y_pred = SVMModels.predict(x_test)
    TestingTime = round(time.time() - TrainingTime - T0, 2)
    Accuracy = accuracy_score(y_test, y_pred)
print "Test Set Accuracy = ", Accuracy print( metrics.classification_report(y_test, y_pred, labels=[1, -1], target_names=['Malware', 'Goodware'])) Report = "Test Set Accuracy = " + str( Accuracy ) + "\n" + metrics.classification_report( y_test, y_pred, labels=[1, -1], target_names=['Malware', 'Goodware']) # pointwise multiplication between weight and feature vect w = BestModel.coef_ w = w[0].tolist() v = x_test.toarray() vocab = FeatureVectorizer.get_feature_names() explanations = {os.path.basename(s): {} for s in AllTestSamples} for i in range(v.shape[0]): wx = v[i, :] * w wv_vocab = zip(wx, vocab) if y_pred[i] == 1: wv_vocab.sort(reverse=True) # print "pred: {}, org: {}".format(y_pred[i],y_test[i]) # pprint(wv_vocab[:10]) explanations[os.path.basename( AllTestSamples[i])]['top_features'] = wv_vocab[:NumTopFeats] elif y_pred[i] == -1: wv_vocab.sort() # print "pred: {}, org: {}".format(y_pred[i],y_test[i]) # pprint(wv_vocab[-10:]) explanations[os.path.basename( AllTestSamples[i])]['top_features'] = wv_vocab[-NumTopFeats:] explanations[os.path.basename( AllTestSamples[i])]['original_label'] = y_test[i] explanations[os.path.basename( AllTestSamples[i])]['predicted_label'] = y_pred[i] with open('explanations_HC.json', 'w') as FH: json.dump(explanations, FH, indent=4) return y_train, y_test, y_pred, TrainingTime, TestingTime
def select_optimal_num_features_load_drebin_gp_familyfold(fold):
    model = "../new_models/Drebin/Alternate/GooglePlay/Fold" + str(fold) + "/modelFoldLinearSVM.pkl"
    Corpus = '/data/Alex/malware-tools/Drebin/data/'
    FeatureVectorizer = TF(input='filename', tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=True)
    return load_data_gp_familyfold(model, FeatureVectorizer, Corpus, fold, ".data", -1)
def load_drebin_vt_familyfold(fold):
    model = "/data/Alex/malware-tools/Drebin/VTFamilyFold/Fold" + str(fold + 1) + "/modelFoldLinearSVM.pkl"
    Corpus = '/data/Alex/malware-tools/Drebin/data/'
    FeatureVectorizer = TF(input='filename', tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=True)
    return load_data_vt_familyfold(model, FeatureVectorizer, Corpus, fold, ".data", -1)
def load_drebin_gp_test_on_vt():
    model = "/data/Alex/malware-tools/Drebin/GPTestAllVT/modelLinear.pkl"
    Corpus = '/data/Alex/malware-tools/Drebin/data/'
    FeatureVectorizer = TF(input='filename', tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=True)
    return load_data_gp_test_on_vt(model, FeatureVectorizer, Corpus, ".data", -1)
def load_csbd_gp_test_on_vt():
    model = "/data/Alex/malware-tools/csbd/GPTestOnVTAll/model.pkl"
    Corpus = '/data/Alex/malware-tools/csbd/data/'
    FeatureVectorizer = TF(input='filename', lowercase=False, token_pattern=None,
                           tokenizer=MyTokenizer, binary=True, dtype=np.float64)
    return load_data_gp_test_on_vt(model, FeatureVectorizer, Corpus, ".txt", 5000)
target_name = tr_data.target_names

from sklearn.svm import SVC

# def feature_work(data=None, vb=None, stop_words=None, max_df=1):
#     cv = CV(stop_words=stop_words, max_df=max_df, vocabulary=vb)
#     # print(cv.vocabulary)
#     tr_vb = cv.vocabulary_
#     tf = TF()
#     tf_idf = tf.fit_transform(cv.fit_transform(data))  # term counts and tf-idf values
#     print('0:', cv.fit_transform(data).shape)
#     print('1:', tf_idf.shape)
#     # word = cv.get_feature_names()  # vocabulary terms
#     # weight = tf_idf.toarray()
#     return tr_vb, tf_idf

cv = CV(stop_words='english', max_df=0.8)
tf = TF()

# Fit the count vectorizer and tf-idf transform on the training text only, then reuse them
# (transform, not fit_transform) on the test text so both matrices share one vocabulary.
tr_counts = cv.fit_transform(tr_data_x)
tr_idf = tf.fit_transform(tr_counts)     # term counts and tf-idf values
print('0:', tr_counts.shape)
te_counts = cv.transform(te_data_x)
te_idf = tf.transform(te_counts)         # tf-idf values in the training vocabulary
print('1:', te_counts.shape)

# train features: tf_tr is the training input derived from tr_data_x; tr_data_y, the
# training target, is left unchanged
# tr_vb, tf_tr = feature_work(tr_data_x, stop_words='english', max_df=0.5)
# test features
# te_vb, tf_te = feature_work(te_data_x, vb=tr_vb)


def getaccuracy(model=None, x=None, y_test=None, tar_name=None):
    y_pre = model.predict(x)
    print(classification_report(y_test, y_pre, target_names=tar_name))
    print(accuracy_score(y_test, y_pre))
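# Hedged usage sketch for getaccuracy: trains an SVC on the tf-idf features built above and
# reports per-class metrics. It assumes tr_data_y and te_data_y hold the train/test targets
# mentioned in the comments above; the hyperparameters are assumptions, not tuned values.
clf = SVC(kernel='linear', C=1.0)
clf.fit(tr_idf, tr_data_y)
getaccuracy(model=clf, x=te_idf, y_test=te_data_y, tar_name=target_name)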
# Load Data
DATA_PATH = "data.csv"
df = pd.read_csv(DATA_PATH, encoding="utf-8").dropna()

# Clean
df["Title"] = df["Title"].apply(lambda x: clean(x))
df["Text"] = df["Text"].apply(lambda x: clean(x))

# UNDERSAMPLE
train_df = sampler(df)

# Feature Extractor
cv = TF(
    sublinear_tf=True,
    min_df=10,
    encoding="latin-1",
    ngram_range=(1, 1),
    stop_words="english",
    analyzer="word",
    max_features=6150,
)
cv.fit(df["Title"] + " " + df["Text"])
pickle.dump(cv, open(VEC_PATH, "wb"))

# Build Model
model = MNB()
train_data = hstack((cv.transform(train_df["Title"]), cv.transform(train_df["Text"])))
model.fit(train_data, train_df["Flair"])
pickle.dump(model, open(MODEL_PATH, "wb"))
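# Hedged inference sketch for the pipeline above: reload the pickled vectorizer and model
# and score one post. The example title/text are assumptions; clean(), VEC_PATH, and
# MODEL_PATH are the same names used above.
cv_loaded = pickle.load(open(VEC_PATH, "rb"))
model_loaded = pickle.load(open(MODEL_PATH, "rb"))
sample_title = clean("What laptop should I buy for programming?")
sample_text = clean("Looking for something under 1000 dollars with good battery life.")
features = hstack((cv_loaded.transform([sample_title]), cv_loaded.transform([sample_text])))
print(model_loaded.predict(features))   # predicted flair label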
return cleaned_text


train_df = pd.read_json('./data/train.json', encoding='utf-8')
print(train_df['review_text'])
test_df = pd.read_json('./data/test.json', encoding='utf-8')

train_df['text'] = train_df['review_text'].apply(lambda x: clean_text(x))
test_df['text'] = test_df['review_text'].apply(lambda x: clean_text(x))
print(train_df['text'].head())

train_df['is_spoiler'] = train_df['is_spoiler'].apply(lambda x: 1 if x else 0)
train_df = train_df.sample(frac=1).reset_index(drop=True)

tfidf = TF(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=200)
print("Creating the tfidf vector...\n")
tfidf.fit(train_df['text'])
x_train = tfidf.transform(train_df['text'])
x_train = x_train.toarray()
x_test = tfidf.transform(test_df['text'])
x_test = x_test.toarray()
print(x_train.shape)
print(x_test.shape)

y_train = train_df['is_spoiler']
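# Hedged follow-up sketch: the snippet above stops after building x_train/x_test, so this
# adds one plausible next step, a cross-validated logistic-regression baseline. The choice
# of classifier is an assumption, not the original author's model.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='f1')
print("5-fold F1:", scores.mean())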
def RandomClassification(MalwareCorpus, GoodwareCorpus, TestSize, NumFeaturesToBeSelected, FeatureOption):
    '''
    Train a classifier for classifying malware and goodware samples using the Random Forest technique.
    Compute the prediction accuracy and f1 score of the classifier.

    :param String MalwareCorpus: absolute path of the malware corpus
    :param String GoodwareCorpus: absolute path of the goodware corpus
    :param Float TestSize: test set split (default is 0.3 for testing and 0.7 for training)
    :param integer NumFeaturesToBeSelected: number of top features to select
    :param Boolean FeatureOption: False
    '''
    # Step 1: Getting the malware and goodware txt files
    Logger.debug("Loading positive and negative samples")
    AllMalSamples = glob.glob(os.path.join(MalwareCorpus, '*txt'))
    AllGoodSamples = glob.glob(os.path.join(GoodwareCorpus, '*txt'))
    Logger.info("All Samples loaded")

    # Step 2: Creating feature vector
    FeatureVectorizer = TF(input='filename', lowercase=False, token_pattern=None,
                           tokenizer=MyTokenizer, binary=FeatureOption, dtype=np.float64)
    X = FeatureVectorizer.fit_transform(AllMalSamples + AllGoodSamples)

    # Label malware as 1 and goodware as -1
    MalLabels = np.ones(len(AllMalSamples))
    GoodLabels = np.empty(len(AllGoodSamples))
    GoodLabels.fill(-1)
    Y = np.concatenate((MalLabels, GoodLabels), axis=0)
    Logger.info("Label array - generated")

    # Step 3: Split all samples into training and test set
    XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=TestSize,
                                                    random_state=randint(0, 100))
    Logger.debug("Test set split = %s", TestSize)

    Features = FeatureVectorizer.get_feature_names()
    Logger.info("Total number of features: {} ".format(len(Features)))

    if len(Features) > NumFeaturesToBeSelected:
        # with feature selection
        Logger.info("Gonna select %s features", NumFeaturesToBeSelected)
        FSAlgo = SelectKBest(chi2, k=NumFeaturesToBeSelected)
        XTrain = FSAlgo.fit_transform(XTrain, YTrain)
        XTest = FSAlgo.transform(XTest)

    Logger.info("Gonna perform classification with C-RandomForest")

    # Step 4: model selection through cross validation
    # Assuming RandomForest is the only classifier we are gonna try, we will set the n_estimators parameter as follows.
    Parameters = {'n_estimators': [10, 50, 100, 200, 500, 1000],
                  'bootstrap': [True, False],
                  'criterion': ['gini', 'entropy']}
    Clf = GridSearchCV(RandomForestClassifier(), Parameters, cv=5, scoring='f1', n_jobs=-1)
    RFmodels = Clf.fit(XTrain, YTrain)
    BestModel = RFmodels.best_estimator_
    Logger.info('CV done - Best model selected: {}'.format(BestModel))
    # Best model is chosen through 5-fold cross validation and stored in the variable: RFmodels

    # Step 5: Evaluate the best model on test set
    YPred = RFmodels.predict(XTest)
    Accuracy = accuracy_score(YTest, YPred)
    print("Test Set Accuracy = ", Accuracy)
    print(metrics.classification_report(YTest, YPred, labels=[1, -1],
                                        target_names=['Malware', 'Goodware']))
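# Toy illustration (values are assumptions, not project data) of the SelectKBest/chi2 step
# used above: it keeps the k columns whose chi-squared statistic with the labels is highest,
# and transform() then drops all other columns.
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
_toy_X = np.array([[1, 0, 3], [0, 0, 4], [1, 1, 0], [0, 1, 1]])
_toy_y = [1, 1, -1, -1]
print(SelectKBest(chi2, k=2).fit(_toy_X, _toy_y).get_support())   # boolean mask of kept columns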
def load_csbd_vt_familyfold(fold):
    model = "/data/Alex/malware-tools/csbd/VTFamilyFold/Fold" + str(fold + 1) + "/model.pkl"
    Corpus = '/data/Alex/malware-tools/csbd/data/'
    FeatureVectorizer = TF(input='filename', lowercase=False, token_pattern=None,
                           tokenizer=MyTokenizer, binary=True, dtype=np.float64)
    return load_data_vt_familyfold(model, FeatureVectorizer, Corpus, fold, ".txt", 5000)
def RandomClassification(MalwareCorpus, GoodwareCorpus, TestSize, FeatureOption, Model, NumTopFeats):
    '''
    Train a classifier for classifying malware and goodware samples using the Support Vector Machine technique.
    Compute the prediction accuracy and f1 score of the classifier.
    Modified from Jiachun's code.

    :param String MalwareCorpus: absolute path of the malware corpus
    :param String GoodwareCorpus: absolute path of the goodware corpus
    :param String FeatureOption: tfidf or binary, specify how to construct the feature vector
    :rtype String Report: result report
    '''
    # step 1: creating feature vector
    Logger.debug("Loading Malware and Goodware Sample Data")
    AllMalSamples = CM.ListFiles(MalwareCorpus, ".data")
    AllGoodSamples = CM.ListFiles(GoodwareCorpus, ".data")
    AllSampleNames = AllMalSamples + AllGoodSamples
    Logger.info("Loaded samples")

    FeatureVectorizer = TF(input='filename', tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=FeatureOption)
    x = FeatureVectorizer.fit_transform(AllMalSamples + AllGoodSamples)

    # label malware as 1 and goodware as -1
    Mal_labels = np.ones(len(AllMalSamples))
    Good_labels = np.empty(len(AllGoodSamples))
    Good_labels.fill(-1)
    y = np.concatenate((Mal_labels, Good_labels), axis=0)
    Logger.info("Label array - generated")

    # step 2: split all samples to training set and test set
    x_train_samplenames, x_test_samplenames, y_train, y_test = train_test_split(
        AllSampleNames, y, test_size=TestSize, random_state=random.randint(0, 100))
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TestSize,
    #                                                     random_state=random.randint(0, 100))
    x_train = FeatureVectorizer.fit_transform(x_train_samplenames)
    x_test = FeatureVectorizer.transform(x_test_samplenames)
    Logger.debug("Test set split = %s", TestSize)
    Logger.info("train-test split done")

    # step 3: train the model
    Logger.info("Perform Classification with SVM Model")
    Parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    T0 = time.time()
    if not Model:
        Clf = GridSearchCV(LinearSVC(), Parameters, cv=5, scoring='f1', n_jobs=-1)
        SVMModels = Clf.fit(x_train, y_train)
        Logger.info("Processing time to train and find best model with GridSearchCV is %s sec."
                    % (round(time.time() - T0, 2)))
        BestModel = SVMModels.best_estimator_
        Logger.info("Best Model Selected : {}".format(BestModel))
        print("The training time for random split classification is %s sec." % (round(time.time() - T0, 2)))
        print("Enter a filename to save the model:")
        filename = input()
        dump(Clf, filename + ".pkl")
    else:
        SVMModels = load(Model)
        BestModel = SVMModels.best_estimator_

    # step 4: Evaluate the best model on test set
    T0 = time.time()
    y_pred = SVMModels.predict(x_test)
    print("The testing time for random split classification is %s sec." % (round(time.time() - T0, 2)))
    Accuracy = accuracy_score(y_test, y_pred)
    print("Test Set Accuracy = {}".format(Accuracy))
    print(metrics.classification_report(y_test, y_pred, labels=[1, -1],
                                         target_names=['Malware', 'Goodware']))
    Report = "Test Set Accuracy = " + str(Accuracy) + "\n" + metrics.classification_report(
        y_test, y_pred, labels=[1, -1], target_names=['Malware', 'Goodware'])

    # pointwise multiplication between weight and feature vector
    w = BestModel.coef_
    w = w[0].tolist()
    v = x_test.toarray()
    vocab = FeatureVectorizer.get_feature_names()
    explanations = {os.path.basename(s): {} for s in x_test_samplenames}
    for i in range(v.shape[0]):
        wx = (v[i, :] * w).tolist()
        wv_vocab = list(zip(wx, vocab))
        if y_pred[i] == 1:
            wv_vocab.sort(reverse=True)
            # print("pred: {}, org: {}".format(y_pred[i], y_test[i]))
            # pprint(wv_vocab[:10])
            explanations[os.path.basename(x_test_samplenames[i])]['top_features'] = wv_vocab[:NumTopFeats]
        elif y_pred[i] == -1:
            wv_vocab.sort()
            # print("pred: {}, org: {}".format(y_pred[i], y_test[i]))
            # pprint(wv_vocab[-10:])
            explanations[os.path.basename(x_test_samplenames[i])]['top_features'] = wv_vocab[-NumTopFeats:]
        explanations[os.path.basename(x_test_samplenames[i])]['original_label'] = float(y_test[i])
        explanations[os.path.basename(x_test_samplenames[i])]['predicted_label'] = float(y_pred[i])

    with open('explanations_RC.json', 'w') as FH:
        json.dump(explanations, FH, indent=4)

    # return TestLabels, PredictedLabels
    return Report
def KmeansCluster(TrainMalSet, labelcsv, family10csv, FeatureOption):
    Logger.debug("Loading Malware and Goodware Sample Data for training and testing")
    TrainMalSamples = getApkList(TrainMalSet, ".data")
    Logger.info("Loaded Samples")

    FeatureVectorizer = TF(input="filename", tokenizer=lambda x: x.split('\n'),
                           token_pattern=None, binary=FeatureOption)
    X = FeatureVectorizer.fit_transform(TrainMalSamples)

    get_family_dict(labelcsv, family10csv)
    y_train = []
    for file in TrainMalSamples:
        if "amd" in labelcsv:
            sha256 = os.path.split(file)[-1][:-5].split('_')[-1]
        else:
            sha256 = os.path.split(file)[-1][:-5]
        if sha256 in family_dict:
            y_train.append(family_dict[sha256])
        else:
            y_train.append(-1)

    # test
    # print(y_train[:20])

    kmeans = KMeans(n_clusters=family_count, random_state=10).fit(X)
    labels = kmeans.labels_
    score = fowlkes_mallows_score(y_train, labels)
    # print(labels[:20])
    print(family_count, score)

    # s_train maps each ground-truth family to the indices of its samples,
    # s_clus maps each k-means cluster to the indices of its samples.
    s_train = {}
    s_clus = {}
    for i in range(family_count):
        s_train[i] = []
        for j in range(len(y_train)):
            if str(y_train[j]) == str(i):
                # print("y_train_j and i", y_train[j], i)
                s_train[i].append(j)  # j is the index of the sample
    print(s_train)
    for i in range(family_count):
        s_clus[i] = []
        for j in range(len(labels)):
            if str(labels[j]) == str(i):
                # print("label_j and i", y_train[j], i)
                s_clus[i].append(j)
    print(s_clus)

    # For each cluster, pick the family with the highest Jaccard similarity.
    label_list = [0] * 10  # index is the cluster id, value is the matched family id
    jac_dict = {}
    for i in range(family_count):
        max_jac = -1
        jac_dict[i] = [0] * family_count
        for j in range(family_count):
            jac_v = jaccard_sim(s_clus[i], s_train[j])
            jac_dict[i][j] = jac_v
            if jac_v > max_jac:
                max_jac = jac_v
                label_list[i] = j
    print("label_list is ", label_list)

    clus_other_fam = {}
    clus_not_in = {}
    for clus_num in range(family_count):
        clus_other_fam[clus_num] = []
        clus_not_in[clus_num] = []
        fam_num = label_list[clus_num]
        # find samples that should be in s_clus[clus_num] but ended up in another cluster
        for item in s_train[fam_num]:  # item is the index of a sample
            if item not in s_clus[clus_num]:
                clus_other_fam[clus_num].append(label_list[labels[item]])  # family the sample leaked out to
        for item in s_clus[clus_num]:
            if item not in s_train[fam_num]:
                clus_not_in[clus_num].append(y_train[item])  # family of a sample that leaked into this cluster

    print("jac matrix: ")
    print("classnum, jac_matrix, most_possi_fam, out_fam, out_fam_num, in_fam, in_fam_num, should in, "
          "actual in, rate1, rate2")
    for clus_num in range(family_count):
        fam_num = label_list[clus_num]
        family_name_str = familyselect[fam_num]
        fam_err_chuqu = [0] * family_count
        fam_err_jinlai = [0] * family_count
        for item in clus_other_fam[clus_num]:
            fam_err_chuqu[item] += 1
        for item in clus_not_in[clus_num]:
            fam_err_jinlai[item] += 1
        chuqu_err_max = max(fam_err_chuqu)
        # samples that left / samples that should belong to this family
        bili1 = format(float(chuqu_err_max) / float(len(s_train[fam_num])), '.4f')
        jinlai_err_max = max(fam_err_jinlai)
        # samples that came in / samples actually assigned to this cluster
        bili2 = format(float(jinlai_err_max) / float(len(s_clus[clus_num])), '.4f')
        chuqu_fam = fam_err_chuqu.index(chuqu_err_max)
        jilai_fam = fam_err_jinlai.index(jinlai_err_max)
        out_fam_name = familyselect[chuqu_fam]
        if chuqu_err_max == 0:
            out_fam_name = ""
        in_fam_name = familyselect[jilai_fam]
        if jinlai_err_max == 0:
            in_fam_name = ""
        print(clus_num, jac_dict[clus_num], family_name_str, out_fam_name, chuqu_err_max, in_fam_name,
              jinlai_err_max, len(s_train[fam_num]), len(s_clus[clus_num]), fam_err_chuqu, fam_err_jinlai)
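# jaccard_sim is called above but not defined in this snippet. A minimal sketch of what it is
# assumed to compute (Jaccard similarity between two lists of sample indices) is given below;
# this is a hypothetical stand-in, not the project's actual helper.
def jaccard_sim_sketch(a, b):
    """Hypothetical stand-in: |a ∩ b| / |a ∪ b| for two lists of sample indices."""
    sa, sb = set(a), set(b)
    if not sa and not sb:
        return 0.0
    return len(sa & sb) / float(len(sa | sb))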