def excute(epoch=70, k_fold=10, batch_size=128):
    # Module-level names assumed from the surrounding file: data, buid_model(),
    # save_model_to_disk(), feature_extraction (sklearn), sklearn.metrics,
    # train_test_split.
    data_training = data.get_training_data()
    # Extract data: X holds the domains, labels holds the class names
    X = [x[0] for x in data_training]
    labels = [x[1] for x in data_training]
    # To use bigrams, convert the input data to vectors with an ngram CountVectorizer
    print '*********** CLASSIFIER DGA BY BIGRAM NEURAL NETWORK ***************'
    print 'Preparing to vectorize input data....'
    print 'Data will be split into character ngrams and converted to a term-document matrix...'
    # Prepare the count vectorizer
    neural_ngram_count_vectorized = feature_extraction.text.CountVectorizer(
        analyzer='char', ngram_range=(2, 2))
    data_training_vectorized = neural_ngram_count_vectorized.fit_transform(X)
    number_feature = data_training_vectorized.shape[1]
    print '-' * 100
    print ' Analysis of training data after vectorization.......'
    print ' Number of domains : %d' % data_training_vectorized.shape[0]
    print ' Number of dimensions : %d' % data_training_vectorized.shape[1]
    print ' Feature list : %r' % neural_ngram_count_vectorized.get_feature_names()
    print '-' * 100

    # Convert labels for the binary classifier
    y = [0 if label == 'legit' else 1 for label in labels]

    report_result = []
    last_model = None
    for fold in range(k_fold):
        print '>>>>>> Fold %d/%d' % (fold + 1, k_fold)
        X_train, X_test, y_train, y_test, _, labels_test = train_test_split(
            data_training_vectorized, y, labels, test_size=0.2)
        print '[*] Building the neural network model for training.........'
        model = buid_model(number_feature=number_feature)
        print '[*] Training data..... ( using holdout sampling )'
        X_train_holdout, X_test_holdout, y_train_holdout, y_test_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_epoch = -1
        best_auc_score = 0.0
        temp_report = {}
        for ep in range(epoch):
            # Train for one epoch
            model.fit(X_train_holdout.todense(), y_train_holdout,
                      batch_size=batch_size, epochs=1)
            # Predict probabilities on the holdout set and compute the AUC score
            probabilities_training = model.predict_proba(X_test_holdout.todense())
            auc_score = sklearn.metrics.roc_auc_score(y_test_holdout,
                                                      probabilities_training)
            print '\n[*] Epoch %d: auc score = %f ( best result = %f )' % (
                ep, auc_score, best_auc_score)
            if auc_score > best_auc_score:
                best_epoch = ep
                best_auc_score = auc_score
                # Predict on X_test and compute the confusion matrix
                probabilities = model.predict_proba(X_test.todense())
                confusion_matrix = sklearn.metrics.confusion_matrix(
                    y_test, probabilities > 0.5)
                temp_report = {
                    'y': y_test,
                    'labels': labels_test,
                    'probs': probabilities,
                    'epoch': ep,
                    'confusion_matrix': confusion_matrix
                }
                print '\n[*] Confusion matrix on epoch %d :' % ep
                print confusion_matrix
            elif (ep - best_epoch) > 6:
                # Early stopping: no AUC improvement for more than 6 epochs
                break
        print '>>>>> End fold %d' % fold
        if fold == (k_fold - 1):
            last_model = model
        report_result.append(temp_report)
    try:
        print '[*] Saving models to disk :'
        save_model_to_disk('neural_network_bigram_model', last_model)
        save_model_to_disk('bigram_count_vectorizer',
                           neural_ngram_count_vectorized)
    except Exception:
        print '[XXX] Cannot save models to disk :('
    print '[*] Finished training data and building the neural network model.'
    print '-' * 100
    return report_result
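
# ---------------------------------------------------------------------------
# Hedged sketch ( not part of the original source ): excute() above relies on
# a buid_model() helper defined elsewhere in this project. A minimal Keras
# model consistent with how it is called here ( dense input of width
# number_feature, a single sigmoid output scored with roc_auc_score ) might
# look like the following; the hidden-layer size, dropout rate and optimizer
# are assumptions, not the project's actual hyperparameters.
from keras.models import Sequential
from keras.layers import Dense, Dropout

def buid_model(number_feature):
    # Simple feed-forward binary classifier over the bigram count vector
    model = Sequential()
    model.add(Dense(128, input_dim=number_feature, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
# ---------------------------------------------------------------------------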
def excute():
    # Execute data analysis, feature building and model building to classify
    # DGA domains with a random forest.
    # Module-level names assumed from the surrounding file: data, entropy(),
    # domain_extract(), show_confusion_matrix(), save_model_to_disk(),
    # IMAGE_DATA_ANALYZE_PATH, plus np, pd, plt, sklearn, operator,
    # RandomForestClassifier, label_binarize, train_test_split, sys, traceback.
    final_report = dict()
    try:
        print '*********** CLASSIFIER DGA BY RANDOM FOREST ***************'
        print '[*] Loading pandas dataframe.....'
        data_training = data.get_training_data()
        # Extract data: X holds the domains, labels holds the class names
        X = [x[0] for x in data_training]
        labels = [x[1] for x in data_training]
        binary_labels = ['legit' if x == 'legit' else 'dga' for x in labels]
        domain_dict = {
            'domain': X,
            'class': labels,
            'bin_class': binary_labels
        }

        # Build the pandas DataFrame
        dataframe = pd.DataFrame(domain_dict)
        dataframe = dataframe.dropna()
        dataframe = dataframe.drop_duplicates()
        print '[*] DataFrame info :'
        print dataframe.info()

        # Shuffle data for training and testing
        dataframe = dataframe.reindex(np.random.permutation(dataframe.index))
        print '[*] Shuffled dataframe data.....'
        print '[*] Dataframe top 20 domains:'
        print dataframe.head(n=20)

        # Boolean conditions to slice the dataframe per class
        condition_legit_domain = dataframe['class'] == 'legit'
        condition_dga_domain = ~condition_legit_domain
        condition_banjori_domain = dataframe['class'] == 'banjori'
        condition_corebot_domain = dataframe['class'] == 'corebot'
        condition_cryptolocker_domain = dataframe['class'] == 'cryptolocker'
        condition_dircrypt_domain = dataframe['class'] == 'dircrypt'
        condition_kraken_domain = dataframe['class'] == 'kraken'
        condition_locky_domain = dataframe['class'] == 'locky'
        condition_pykspa_domain = dataframe['class'] == 'pykspa'
        condition_qakbot_domain = dataframe['class'] == 'qakbot'
        condition_ramdo_domain = dataframe['class'] == 'ramdo'
        condition_ramnit_domain = dataframe['class'] == 'ramnit'
        condition_simda_domain = dataframe['class'] == 'simda'
        print '[*] Total legit ( Alexa based ) domains : %d' % dataframe[
            condition_legit_domain].shape[0]
        print '[*] Total dga domains : %d' % dataframe[
            condition_dga_domain].shape[0]

        # Add a length field to the dataframe
        dataframe['length'] = [len(x) for x in dataframe['domain']]
        # Calculate and add an entropy field to the dataframe
        dataframe['entropy'] = [
            entropy(domain=domain) for domain in dataframe['domain']
        ]
        print '[*] Show complete dataframe top 50 :'
        print dataframe.head(n=50)

        # Draw boxplots for domain length and entropy grouped by class
        plt.clf()
        plt.close()
        dataframe.boxplot('length', 'class')
        plt.ylabel('Dataframe domain length')
        dataframe.boxplot('entropy', 'class')
        plt.ylabel('Dataframe domain entropy')
        plt.grid(True)
        # plt.show()
        plt.savefig(IMAGE_DATA_ANALYZE_PATH +
                    'box_plot_domain_length_entropy_class.png')

        # Draw boxplots for domain length and entropy grouped by bin_class
        plt.clf()
        plt.close()
        dataframe.boxplot('length', 'bin_class')
        plt.ylabel('Dataframe domain length')
        dataframe.boxplot('entropy', 'bin_class')
        plt.ylabel('Dataframe domain entropy')
        plt.grid(True)
        # plt.show()
        plt.savefig(IMAGE_DATA_ANALYZE_PATH +
                    'box_plot_domain_length_entropy_bin_class.png')

        # Marker size and color per DGA family, shared by every multiclass
        # scatter plot below
        multiclass_plot_styles = [
            ('banjori', condition_banjori_domain, 270, '#003e23'),
            ('corebot', condition_corebot_domain, 240, '#00263e'),
            ('cryptolocker', condition_cryptolocker_domain, 210, '#f4cfeb'),
            ('dircrypt', condition_dircrypt_domain, 180, '#460060'),
            ('kraken', condition_kraken_domain, 150, '#968888'),
            ('locky_v2', condition_locky_domain, 120, '#112233'),
            ('pykspa', condition_pykspa_domain, 90, '#0e2f44'),
            ('qakbot', condition_qakbot_domain, 60, '#fb8d8b'),
            ('ramdo', condition_ramdo_domain, 30, '#033e7b'),
            ('ramnit', condition_ramnit_domain, 15, '#ffde56'),
            ('simda', condition_simda_domain, 5, '#f82831'),
        ]

        def scatter_plot(x_field, y_field, filename, multiclass=False):
            # Scatter two dataframe fields: legit domains against either the
            # combined DGA set or each DGA family separately
            plt.clf()
            plt.close()
            legit = dataframe[condition_legit_domain]
            plt.scatter(legit[x_field], legit[y_field],
                        s=300, c='#3bc293', label='Legit', alpha=.2)
            if multiclass:
                for label, condition, size, color in multiclass_plot_styles:
                    family = dataframe[condition]
                    plt.scatter(family[x_field], family[y_field],
                                s=size, c=color, label=label, alpha=.3)
            else:
                dga = dataframe[condition_dga_domain]
                plt.scatter(dga[x_field], dga[y_field],
                            s=40, c='#60004a', label='DGA', alpha=.3)
            plt.legend()
            plt.xlabel('Dataframe domain %s' % x_field)
            plt.ylabel('Dataframe domain %s' % y_field)
            plt.grid(True)
            # plt.show()
            plt.savefig(IMAGE_DATA_ANALYZE_PATH + filename)

        # Slice sub-dataframes used to fit the ngram vectorizers below
        dga_domain = dataframe[condition_dga_domain]
        legit_domain = dataframe[condition_legit_domain]

        # Scatter (length, entropy) for dga vs legit, then per DGA family
        scatter_plot('length', 'entropy',
                     'scatter_entropy_length_binary_class.png')
        scatter_plot('length', 'entropy',
                     'scatter_entropy_length_multiclass.png', multiclass=True)

        # Build features ( ngram based ): character ngrams with n in 3..5,
        # then vectorize input domains with the extracted features.
        # Build the legit base feature
        legit_count_vectorized = sklearn.feature_extraction.text.CountVectorizer(
            analyzer='char', ngram_range=(3, 5), min_df=1e-4, max_df=1.0)
        legit_domain_matrix = legit_count_vectorized.fit_transform(
            legit_domain['domain'])
        count_each_feature_legit_domain_matrix = np.log10(
            legit_domain_matrix.sum(axis=0).getA1())
        feature_list_legit_domain = legit_count_vectorized.get_feature_names()
        sorted_count_each_feature_legit_domain = sorted(
            zip(feature_list_legit_domain,
                count_each_feature_legit_domain_matrix),
            key=operator.itemgetter(1),
            reverse=True)
        print '[*] Legit number of ngram features: %d' % len(
            sorted_count_each_feature_legit_domain)

        number_feature_show = 20

        def print_top_ngram_features(sorted_features):
            print '>> Top %d highest count ngram features :' % number_feature_show
            print '%15s %10s' % ('ngram-feature', 'count')
            print '-' * 30
            for ngram_feature, count in sorted_features[:number_feature_show]:
                print '%15s %10f' % (ngram_feature, count)
            print '-' * 30

        print_top_ngram_features(sorted_count_each_feature_legit_domain)

        # Build the dga base feature
        dga_count_vectorized = sklearn.feature_extraction.text.CountVectorizer(
            analyzer='char', ngram_range=(3, 5), min_df=1e-4, max_df=1.0)
        dga_domain_matrix = dga_count_vectorized.fit_transform(
            dga_domain['domain'])
        count_each_feature_dga_domain_matrix = np.log10(
            dga_domain_matrix.sum(axis=0).getA1())
        feature_list_dga_domain = dga_count_vectorized.get_feature_names()
        sorted_count_each_feature_dga_domain = sorted(
            zip(feature_list_dga_domain, count_each_feature_dga_domain_matrix),
            key=operator.itemgetter(1),
            reverse=True)
        print '[*] DGA number of ngram features: %d' % len(
            sorted_count_each_feature_dga_domain)
        print_top_ngram_features(sorted_count_each_feature_dga_domain)

        # Build the dictionary base feature
        # Load the dictionary for the calculations below
        print '[*] Loading dictionary from ~/data/dictionary.txt .....'
        dictionary_dataframe = pd.read_csv('./data/dictionary.txt',
                                           names=['word'],
                                           header=None,
                                           dtype={'word': np.str},
                                           encoding='utf-8')
        # Preprocess dictionary_dataframe before using it to build training data
        dictionary_dataframe = dictionary_dataframe[
            dictionary_dataframe['word'].map(lambda x: str(x).isalpha())]
        dictionary_dataframe = dictionary_dataframe.applymap(
            lambda x: str(x).strip().lower())
        dictionary_dataframe = dictionary_dataframe.dropna()
        dictionary_dataframe = dictionary_dataframe.drop_duplicates()
        print '[*] Dictionary after preprocessing :'
        print dictionary_dataframe.head(n=10)

        # Build a count vectorizer for the dictionary to score domains against it
        dictionary_count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
            analyzer='char', ngram_range=(3, 5), min_df=1e-5, max_df=1.0)
        dictionary_domain_matrix = dictionary_count_vectorizer.fit_transform(
            dictionary_dataframe['word'])
        count_each_feature_dictionary_domain_matrix = np.log10(
            dictionary_domain_matrix.sum(axis=0).getA1())
        feature_list_dictionary_domain = dictionary_count_vectorizer.get_feature_names()
        sorted_count_each_feature_dict_domain = sorted(
            zip(feature_list_dictionary_domain,
                count_each_feature_dictionary_domain_matrix),
            key=operator.itemgetter(1),
            reverse=True)
        print '[*] Dictionary number of ngram features: %d' % len(
            sorted_count_each_feature_dict_domain)
        print_top_ngram_features(sorted_count_each_feature_dict_domain)

        def ngram_domain_score(domain):
            legit_score = count_each_feature_legit_domain_matrix * \
                legit_count_vectorized.transform([domain]).T
            dict_score = count_each_feature_dictionary_domain_matrix * \
                dictionary_count_vectorizer.transform([domain]).T
            dga_score = count_each_feature_dga_domain_matrix * \
                dga_count_vectorized.transform([domain]).T
            print '>>>> Domain \'%s\' :\n $legit_score = %f\n $dict_score = %f\n $dga_score = %f' \
                % (domain, legit_score, dict_score, dga_score)

        # Test the domain scores on some popular domains
        ngram_domain_score(domain='google')
        ngram_domain_score(domain='facebook')
        ngram_domain_score(domain='vnexpress')
        ngram_domain_score(domain='tinhte')
        ngram_domain_score(domain='kenh14')
        ngram_domain_score(domain='zing')
        ngram_domain_score(domain='chiasenhac')

        # Calculate domain scores for all domains
        dataframe['legit_score'] = count_each_feature_legit_domain_matrix * \
            legit_count_vectorized.transform(dataframe['domain']).T
        dataframe['dict_score'] = count_each_feature_dictionary_domain_matrix * \
            dictionary_count_vectorizer.transform(dataframe['domain']).T
        dataframe['dga_score'] = count_each_feature_dga_domain_matrix * \
            dga_count_vectorized.transform(dataframe['domain']).T

        # Divergence between legit and dict scores:
        # legit_score > dict_score => more legit, otherwise => more dictionary-like
        dataframe['divergence_legit_dict'] = \
            dataframe['legit_score'] - dataframe['dict_score']
        # Divergence between legit and dga scores:
        # legit_score > dga_score => more legit, otherwise => more dga-like
        dataframe['divergence_legit_dga'] = \
            dataframe['legit_score'] - dataframe['dga_score']

        # Domains that look more like dictionary words than websites
        print '[*] Domains that look more dictionary-like than web-like ( lowest divergence_legit_dict )'
        print dataframe.sort_values(by=['divergence_legit_dict'],
                                    ascending=True, kind='quicksort').head(n=10)
        # Domains that look more like websites than dictionary words
        print '[*] Domains that look more web-like than dictionary-like ( highest divergence_legit_dict )'
        print dataframe.sort_values(by=['divergence_legit_dict'],
                                    ascending=False, kind='quicksort').head(n=10)
        # Domains that look more dga than legit
        print '[*] Domains that look more dga than legit ( lowest divergence_legit_dga )'
        print dataframe.sort_values(by=['divergence_legit_dga'],
                                    ascending=True, kind='quicksort').head(n=10)
        # Domains that look more legit than dga
        print '[*] Domains that look more legit than dga ( highest divergence_legit_dga )'
        print dataframe.sort_values(by=['divergence_legit_dga'],
                                    ascending=False, kind='quicksort').head(n=10)

        # Visualize the effect of the score features.
        # Rebuild the sub-dataframes so they include the new score columns
        dga_domain = dataframe[condition_dga_domain]
        legit_domain = dataframe[condition_legit_domain]

        # legit_score plots
        scatter_plot('length', 'legit_score', 'legit_score_length.png')
        scatter_plot('entropy', 'legit_score', 'legit_score_entropy.png')
        scatter_plot('length', 'legit_score', 'legit_score_length_multi.png',
                     multiclass=True)
        scatter_plot('entropy', 'legit_score', 'legit_score_entropy_multi.png',
                     multiclass=True)
        # dict_score plots
        scatter_plot('length', 'dict_score', 'dict_score_length.png')
        scatter_plot('entropy', 'dict_score', 'dict_score_entropy.png')
        scatter_plot('length', 'dict_score', 'dict_score_length_multi.png',
                     multiclass=True)
        scatter_plot('entropy', 'dict_score', 'dict_score_entropy_multi.png',
                     multiclass=True)
        # dga_score plots
        scatter_plot('length', 'dga_score', 'dga_score_length.png')
        scatter_plot('entropy', 'dga_score', 'dga_score_entropy.png')
        scatter_plot('length', 'dga_score', 'dga_score_length_multi.png',
                     multiclass=True)
        scatter_plot('entropy', 'dga_score', 'dga_score_entropy_multi.png',
                     multiclass=True)

        print '[*] Analyze fields in dataframe :'
        print '>>> Legit domains'
        print dataframe[condition_legit_domain].describe()
        print '>>> Dga domains'
        print dataframe[condition_dga_domain].describe()
        print '-' * 30

        # Plot a histogram of max(legit_score, dict_score) for legit domains
        max_ngram = np.maximum(legit_domain['legit_score'],
                               legit_domain['dict_score'])
        plt.clf()
        plt.close()
        plt.hist(max_ngram, bins=150, histtype='step', color='#f82831')
        plt.suptitle('Histogram of the max legit_dict_score for legit domains')
        plt.xlabel('Max (legit, dictionary) score')
        plt.ylabel('Frequency')
        plt.grid(True)
        # plt.show()
        plt.savefig(IMAGE_DATA_ANALYZE_PATH + 'histogram_legit.png')

        # Plot a histogram of dga_score for dga domains
        plt.clf()
        plt.close()
        plt.hist(np.array(dga_domain['dga_score']), bins=150,
                 histtype='step', color='#003300')
        plt.suptitle('Histogram of the dga_score for dga domains')
        plt.xlabel('dga_score')
        plt.ylabel('Frequency')
        plt.grid(True)
        # plt.show()
        plt.savefig(IMAGE_DATA_ANALYZE_PATH + 'histogram_dga.png')

        # Export the dataframe to json/csv
        dataframe.to_json('./export/dataframe.json')
        dataframe.to_csv('./export/dataframe.csv')

        # Data analysis and preprocessing are done; next, build a RandomForest
        # model to train on and classify dga domains
        randomforest_labels = ['legit', 'dga']
        randomforest_multilabels = [
            'legit', 'banjori', 'corebot', 'cryptolocker', 'dircrypt',
            'kraken', 'locky', 'pykspa', 'qakbot', 'ramdo', 'ramnit', 'simda'
        ]
        # X -> training dataset, y -> labels
        X = dataframe.as_matrix(
            ['length', 'entropy', 'legit_score', 'dict_score', 'dga_score'])
        y = np.array(dataframe['bin_class'])
        y_multiclass = np.array(dataframe['class'])
        y_multiclass_binarize = label_binarize(
            y_multiclass, classes=randomforest_multilabels)
        randomforest_multilabels_binarize = label_binarize(
            randomforest_multilabels, classes=randomforest_multilabels)

        # RandomForest model to classify
        randomforest_clf = RandomForestClassifier(n_estimators=20,
                                                  criterion='entropy',
                                                  bootstrap=True,
                                                  n_jobs=-1)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.2)
        print '[*] Training data......'
        randomforest_clf.fit(X_train, y_train)

        # For multiclass metrics, convert y_test to a binarized form
        y_test_bin = label_binarize(y_test, randomforest_clf.classes_)
        num_class = y_test_bin.shape[1]
        y_test_bin = np.array([x.astype(int) for x in y_test_bin])

        # Dictionaries to translate index <-> class easily
        index_class_dict = {}
        class_index_dict = {}
        for idx, x in enumerate(randomforest_clf.classes_):
            index_class_dict[idx] = x
            class_index_dict[x] = idx

        print '[*] Testing data.....'
        y_predict = randomforest_clf.predict(X_test)
        y_predict_prob = randomforest_clf.predict_proba(X_test)
        confusion_matrix = sklearn.metrics.confusion_matrix(
            y_test, y_predict, randomforest_labels)
        print '[*] Show confusion matrix :'
        print confusion_matrix
        print '[*] Analyzing result.......'
        show_confusion_matrix(confusion_matrix, randomforest_labels)

        importances_feature = zip(
            ['length', 'entropy', 'legit_score', 'dict_score', 'dga_score'],
            randomforest_clf.feature_importances_)
        print '==>> Feature importances :'
        for feature, importance in importances_feature:
            print ' > Feature \'%s\' ( entropy-based importance ) = %f' % (
                feature, importance)

        print '[*] Classes recognized by the randomforest classifier :'
        print randomforest_clf.classes_
        prob_perclass_dict = dict()
        for index, class_name in enumerate(randomforest_clf.classes_):
            prob_perclass_dict[class_name] = y_predict_prob[:, index]

        # Convert labels to binary truth values
        y_test_true_table = [0 if label == 'legit' else 1 for label in y_test]
        y_test_true_table_multi = [
            1 if label == 'legit' else 0 for label in y_test
        ]

        # Calculate fpr, tpr for the roc curve and compute the auc
        fpr, tpr, threshold = sklearn.metrics.roc_curve(
            y_test_true_table, np.array(prob_perclass_dict['dga']))
        auc = sklearn.metrics.auc(fpr, tpr)
        print '[*] AUC is %f' % auc

        # Retrain on the whole dataframe
        print '[*] Training on the whole dataset.......'
        randomforest_clf.fit(X, y)
        # randomforest_clf.fit(X, y_multiclass)

        def quick_test(uri):
            domain = domain_extract(uri=uri)
            legit_score = count_each_feature_legit_domain_matrix * \
                legit_count_vectorized.transform([domain]).T
            dict_score = count_each_feature_dictionary_domain_matrix * \
                dictionary_count_vectorizer.transform([domain]).T
            dga_score = count_each_feature_dga_domain_matrix * \
                dga_count_vectorized.transform([domain]).T
            vectorized_domain = [
                len(domain), entropy(domain), legit_score, dict_score,
                dga_score
            ]
            print '>>>> Test domain \'%s\' : %s' % (
                uri, randomforest_clf.predict([vectorized_domain])[0])

        # Test on some legit and dga domains
        print '[*] Test on some domains : '
        quick_test('google.com.vn')
        quick_test('vnexpress.net')
        quick_test('tinhte.vn')
        quick_test('kenh14.vn')
        quick_test('40a43e61e56a5c218cf6c22aca27f7ee.org')
        quick_test('agabgtdhgsbspwsq.ru')
        quick_test('dantri.net')
        quick_test('axtopsbtntqnfdyk.ru')
        quick_test('ahamove.com.vn')
        quick_test('batqeodiji.com')
        quick_test('bdjhtgqhggicwrmy.ru')
        quick_test('melhlehkvxoxbqq.net')
        # WannaCry kill switch domain
        quick_test('www.iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com')
        # A similarly weird domain =)) to compare with the WannaCry kill switch
        quick_test('www.hungdetraicogisaikhongthebaolakhongdeptrai.com')
        print '==>> Finish testing'

        # Save the whole model
        print '[*] Saving models to disk:'
        save_model_to_disk('legit_count_matrix',
                           count_each_feature_legit_domain_matrix)
        save_model_to_disk('legit_count_vectorizer', legit_count_vectorized)
        save_model_to_disk('dictionary_count_matrix',
                           count_each_feature_dictionary_domain_matrix)
        save_model_to_disk('dictionary_count_vectorizer',
                           dictionary_count_vectorizer)
        save_model_to_disk('dga_count_matrix',
                           count_each_feature_dga_domain_matrix)
        save_model_to_disk('dga_count_vectorizer', dga_count_vectorized)
        save_model_to_disk('random_forest_classifier', randomforest_clf)
        plt.close()

        final_report = {
            'y': y_test_true_table,
            'labels': y_test,
            'probs': prob_perclass_dict,
            'epoch': 0,
            'confusion_matrix': confusion_matrix
        }
    except KeyboardInterrupt:
        print '>>>>>>>> Terminating.....'
        sys.exit(0)
    except Exception as error:
        print '>>>>>>>> Cannot build model :(....'
        traceback.print_exc()
        print 'Error occurred : %s' % str(error)
        sys.exit(1)
    return final_report
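
# ---------------------------------------------------------------------------
# Hedged sketch ( not part of the original source ): the entropy() helper used
# throughout this pipeline is defined elsewhere in the project. A standard
# Shannon-entropy implementation over the character distribution of a domain
# string, matching the entropy(domain=...) call signature above, could be:
import math
from collections import Counter

def entropy(domain):
    # Shannon entropy in bits of the character distribution of `domain`
    length = float(len(domain))
    counts = Counter(domain)
    return -sum((n / length) * math.log(n / length, 2)
                for n in counts.values())
# ---------------------------------------------------------------------------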
def excute(epoch=70, k_fold=10, batch_size=128):
    # Module-level names assumed from the surrounding file: data, buid_model(),
    # save_model_to_disk(), keras, feature_extraction (sklearn),
    # sklearn.metrics, train_test_split, roc_curve, auc.
    data_training = data.get_training_data()
    # Extract data: X holds the domains, labels holds the class names
    X = [x[0] for x in data_training]
    labels = [x[1] for x in data_training]
    # To use bigrams, convert the input data to vectors with an ngram CountVectorizer
    print '*********** MULTICLASS DGA CLASSIFIER BY BIGRAM NEURAL NETWORK ***************'
    print 'Preparing to vectorize input data....'
    print 'Data will be split into character ngrams and converted to a term-document matrix...'
    # Prepare the count vectorizer
    neural_ngram_count_vectorized = feature_extraction.text.CountVectorizer(
        analyzer='char', ngram_range=(2, 2))
    data_training_vectorized = neural_ngram_count_vectorized.fit_transform(X)
    number_feature = data_training_vectorized.shape[1]
    print '-' * 100
    print ' Analysis of training data after vectorization.......'
    print ' Number of domains : %d' % data_training_vectorized.shape[0]
    print ' Number of dimensions : %d' % data_training_vectorized.shape[1]
    print ' Feature list : %r' % neural_ngram_count_vectorized.get_feature_names()
    print '-' * 100

    # Mappings between class names and label indexes
    name_class = [
        'legit', 'banjori', 'corebot', 'cryptolocker', 'dircrypt', 'kraken',
        'locky', 'pykspa', 'qakbot', 'ramdo', 'ramnit', 'simda'
    ]
    label_dict = {}
    label_index_to_name_dict = {}
    for idx, name in enumerate(name_class):
        label_dict[name] = idx
        label_index_to_name_dict[idx] = name

    # Index the labels
    y = [label_dict[name] for name in labels]
    index_labels = y
    # Convert labels for the multiclass classifier using one-hot encoding
    y = keras.utils.to_categorical(y=y, num_classes=len(name_class))

    report_result = []
    last_model = None
    for fold in range(k_fold):
        print '>>>>>> Fold %d/%d' % (fold + 1, k_fold)
        X_train, X_test, y_train, y_test, _, labels_test = train_test_split(
            data_training_vectorized, y, index_labels, test_size=0.2)
        print '[*] Building the neural network model for training.........'
        model = buid_model(number_feature=number_feature,
                           number_class=len(name_class))
        print '[*] Training data..... ( using holdout sampling )'
        X_train_holdout, X_test_holdout, y_train_holdout, y_test_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_epoch = -1
        best_auc_score = 0.0
        temp_report = {}
        for ep in range(epoch):
            # Train for one epoch
            model.fit(X_train_holdout.todense(), y_train_holdout,
                      batch_size=batch_size, epochs=1)
            # Predict probabilities on the holdout set
            probabilities_training = model.predict_proba(
                X_test_holdout.todense())
            # Calculate per-class and micro-averaged AUC
            tpr = {}
            fpr = {}
            roc_auc = {}
            for name in name_class:
                fpr[name], tpr[name], _ = roc_curve(
                    y_test_holdout[:, label_dict[name]],
                    probabilities_training[:, label_dict[name]])
                roc_auc[name] = auc(fpr[name], tpr[name])
            fpr['micro'], tpr['micro'], _ = roc_curve(
                y_test_holdout.ravel(), probabilities_training.ravel())
            roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])
            auc_score = roc_auc['micro']
            print '\n[*] Epoch %d: micro auc score = %f ( best result = %f )' % (
                ep, auc_score, best_auc_score)
            if auc_score > best_auc_score:
                best_epoch = ep
                best_auc_score = auc_score
                # Predict on X_test and compute the confusion matrix;
                # pass the label indexes in class order ( dict.values() has no
                # guaranteed order in Python 2 )
                probabilities = model.predict_proba(X_test.todense())
                pred_class = model.predict_classes(X_test.todense())
                confusion_matrix = sklearn.metrics.confusion_matrix(
                    labels_test, pred_class, range(len(name_class)))
                temp_report = {
                    'y': y_test,
                    'labels': labels_test,
                    'probs': probabilities,
                    'epoch': ep,
                    'confusion_matrix': confusion_matrix,
                    'name_to_index': label_dict,
                    'index_to_name': label_index_to_name_dict
                }
                print '\n[*] Confusion matrix on epoch %d :' % ep
                print confusion_matrix
            elif (ep - best_epoch) > 6:
                # Early stopping: no AUC improvement for more than 6 epochs
                break
        print '>>>>> End fold %d' % fold
        if fold == (k_fold - 1):
            last_model = model
        report_result.append(temp_report)
    try:
        print '[*] Saving models to disk :'
        save_model_to_disk('neural_network_bigram_multiclass_model', last_model)
        save_model_to_disk('bigram_count_vectorizer_multiclass',
                           neural_ngram_count_vectorized)
    except Exception:
        print '[XXX] Cannot save models to disk :('
    print '[*] Finished training data and building the neural network model.'
    print '-' * 100
    return report_result
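
# ---------------------------------------------------------------------------
# Hedged sketch ( not part of the original source ): the multiclass
# buid_model(number_feature, number_class) called above is defined elsewhere.
# Consistent with its use here ( one-hot targets, per-class probabilities via
# predict_proba, class indexes via predict_classes ), a minimal Keras version
# might be the following; layer sizes and optimizer are assumptions.
from keras.models import Sequential
from keras.layers import Dense, Dropout

def buid_model(number_feature, number_class):
    # Feed-forward softmax classifier over the bigram count vector
    model = Sequential()
    model.add(Dense(128, input_dim=number_feature, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(number_class, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
# ---------------------------------------------------------------------------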
def excute(epoch=25, k_fold=10, batch_size=128):
    # Module-level names assumed from the surrounding file: data, buid_model(),
    # save_model_to_disk(), Vectorizer, keras, sequence (keras.preprocessing),
    # np, sklearn.metrics, train_test_split, roc_curve, auc.
    data_training = data.get_training_data()
    # Extract data: X holds the domains, labels holds the class names
    X = [x[0] for x in data_training]
    labels = [x[1] for x in data_training]

    # Build a dictionary of characters; index 0 is reserved for padding
    char_dict = {x: index + 1 for index, x in enumerate(set(''.join(X)))}
    # Add 1 to len(char_dict) for the special padding character
    input_range = len(char_dict) + 1
    maxlen = np.max([len(x) for x in X])
    lstm_vectorizer = Vectorizer(char_dict)

    # Convert the domain list to integer vectors and zero-pad each to length maxlen
    X = [[char_dict[character] for character in domain] for domain in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Mappings between class names and label indexes
    name_class = [
        'legit', 'banjori', 'corebot', 'cryptolocker', 'dircrypt', 'kraken',
        'locky', 'pykspa', 'qakbot', 'ramdo', 'ramnit', 'simda'
    ]
    label_dict = {}
    label_index_to_name_dict = {}
    for idx, name in enumerate(name_class):
        label_dict[name] = idx
        label_index_to_name_dict[idx] = name

    # Index the labels
    y = [label_dict[name] for name in labels]
    index_labels = y
    # Convert labels for the multiclass classifier using one-hot encoding
    y = keras.utils.to_categorical(y=y, num_classes=len(name_class))

    report_result = []
    last_model = None
    print '\n*********** MULTICLASS DGA CLASSIFIER BY LSTM RNN ( FEATURELESS ) ***************'
    print '[*] Preparing....'
    for fold in range(k_fold):
        print '>>>>>> Fold %d/%d' % (fold + 1, k_fold)
        X_train, X_test, y_train, y_test, _, labels_test = train_test_split(
            X, y, index_labels, test_size=0.2)
        print '[*] Building the neural network model for training.........'
        model = buid_model(input_range=input_range,
                           maxlen=maxlen,
                           number_class=len(name_class))
        print '[*] Training data..... ( using holdout sampling )'
        X_train_holdout, X_test_holdout, y_train_holdout, y_test_holdout = train_test_split(
            X_train, y_train, test_size=0.06)
        best_epoch = -1
        best_auc_score = 0.0
        temp_report = {}
        for ep in range(epoch):
            # Train for one epoch
            model.fit(X_train_holdout, y_train_holdout,
                      batch_size=batch_size, epochs=1)
            # Predict probabilities on the holdout set
            probabilities_training = model.predict_proba(X_test_holdout)
            # Calculate per-class and micro-averaged AUC
            tpr = {}
            fpr = {}
            roc_auc = {}
            for name in name_class:
                fpr[name], tpr[name], _ = roc_curve(
                    y_test_holdout[:, label_dict[name]],
                    probabilities_training[:, label_dict[name]])
                roc_auc[name] = auc(fpr[name], tpr[name])
            fpr['micro'], tpr['micro'], _ = roc_curve(
                y_test_holdout.ravel(), probabilities_training.ravel())
            roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])
            auc_score = roc_auc['micro']
            print '\n[*] Epoch %d: micro auc score = %f ( best result = %f )' % (
                ep, auc_score, best_auc_score)
            if auc_score > best_auc_score:
                best_epoch = ep
                best_auc_score = auc_score
                # Predict on X_test and compute the confusion matrix;
                # pass the label indexes in class order ( dict.values() has no
                # guaranteed order in Python 2 )
                probabilities = model.predict_proba(X_test)
                pred_class = model.predict_classes(X_test)
                confusion_matrix = sklearn.metrics.confusion_matrix(
                    labels_test, pred_class, range(len(name_class)))
                temp_report = {
                    'y': y_test,
                    'labels': labels_test,
                    'probs': probabilities,
                    'epoch': ep,
                    'confusion_matrix': confusion_matrix,
                    'name_to_index': label_dict,
                    'index_to_name': label_index_to_name_dict
                }
                print '\n[*] Confusion matrix on epoch %d :' % ep
                print confusion_matrix
            elif (ep - best_epoch) > 2:
                # Early stopping: no AUC improvement for more than 2 epochs
                break
        print '>>>>> End fold %d' % fold
        # Keep the model from the last fold
        if fold == (k_fold - 1):
            last_model = model
        report_result.append(temp_report)
    try:
        print '[*] Saving models to disk :'
        save_model_to_disk('neural_network_lstm_model', last_model)
        save_model_to_disk('lstm_vectorizer', lstm_vectorizer)
    except Exception:
        print '[XXX] Cannot save models to disk :('
    print '[*] Finished training data and building the neural network model.'
    print '-' * 100
    return report_result
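
# ---------------------------------------------------------------------------
# Hedged sketch ( not part of the original source ): the
# buid_model(input_range, maxlen, number_class) helper for the featureless
# LSTM is defined elsewhere in the project. A minimal Keras model consistent
# with its use above ( integer character sequences of length maxlen,
# vocabulary size input_range, one-hot multiclass targets ) might look like
# this; the embedding width, LSTM size and optimizer are assumptions.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM

def buid_model(input_range, maxlen, number_class):
    # Character embedding -> LSTM -> softmax over the DGA families
    model = Sequential()
    model.add(Embedding(input_range, 128, input_length=maxlen))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(number_class, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model
# ---------------------------------------------------------------------------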
def excute(epoch=25, k_fold=10, batch_size=128):
    # Module-level names assumed from the surrounding file: data, buid_model(),
    # save_model_to_disk(), Vectorizer, sequence (keras.preprocessing), np,
    # sklearn.metrics, train_test_split.
    data_training = data.get_training_data()
    # Extract data: X holds the domains, labels holds the class names
    X = [x[0] for x in data_training]
    labels = [x[1] for x in data_training]

    # Build a dictionary of characters; index 0 is reserved for padding
    char_dict = {x: index + 1 for index, x in enumerate(set(''.join(X)))}
    # Add 1 to len(char_dict) for the special padding character
    input_range = len(char_dict) + 1
    maxlen = np.max([len(x) for x in X])
    lstm_vectorizer = Vectorizer(char_dict)

    # Convert the domain list to integer vectors and zero-pad each to length maxlen
    X = [[char_dict[character] for character in domain] for domain in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels for the binary classifier
    y = [0 if label == 'legit' else 1 for label in labels]

    report_result = []
    print '\n*********** CLASSIFIER DGA BY LSTM RNN ( FEATURELESS ) ***************'
    print '[*] Preparing....'
    last_model = None
    for fold in range(k_fold):
        print '>>>>>> Fold %d/%d' % (fold + 1, k_fold)
        X_train, X_test, y_train, y_test, _, labels_test = train_test_split(
            X, y, labels, test_size=0.2)
        print '[*] Building the neural network model for training.........'
        model = buid_model(input_range=input_range, maxlen=maxlen)
        print '[*] Training data..... ( using holdout sampling )'
        X_train_holdout, X_test_holdout, y_train_holdout, y_test_holdout = train_test_split(
            X_train, y_train, test_size=0.06)
        best_epoch = -1
        best_auc_score = 0.0
        temp_report = {}
        for ep in range(epoch):
            # Train for one epoch
            model.fit(X_train_holdout, y_train_holdout,
                      batch_size=batch_size, epochs=1)
            # Predict probabilities on the holdout set and compute the AUC score
            probabilities_training = model.predict_proba(X_test_holdout)
            auc_score = sklearn.metrics.roc_auc_score(y_test_holdout,
                                                      probabilities_training)
            print '\n[*] Epoch %d: auc score = %f ( best result = %f )' % (
                ep, auc_score, best_auc_score)
            if auc_score > best_auc_score:
                best_epoch = ep
                best_auc_score = auc_score
                # Predict on X_test and compute the confusion matrix
                probabilities = model.predict_proba(X_test)
                confusion_matrix = sklearn.metrics.confusion_matrix(
                    y_test, probabilities > 0.5)
                temp_report = {
                    'y': y_test,
                    'labels': labels_test,
                    'probs': probabilities,
                    'epoch': ep,
                    'confusion_matrix': confusion_matrix
                }
                print '\n[*] Confusion matrix on epoch %d :' % ep
                print confusion_matrix
            elif (ep - best_epoch) > 2:
                # Early stopping: no AUC improvement for more than 2 epochs
                break
        print '>>>>> End fold %d' % fold
        # Keep the model from the last fold
        if fold == (k_fold - 1):
            last_model = model
        report_result.append(temp_report)
    try:
        print '[*] Saving models to disk :'
        save_model_to_disk('neural_network_lstm_model', last_model)
        save_model_to_disk('lstm_vectorizer', lstm_vectorizer)
    except Exception:
        print '[XXX] Cannot save models to disk :('
    print '[*] Finished training data and building the neural network model.'
    print '-' * 100
    return report_result
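
# ---------------------------------------------------------------------------
# Hedged sketch ( not part of the original source ): the Vectorizer class
# persisted as 'lstm_vectorizer' above is defined elsewhere in the project.
# Given that it is constructed from char_dict and saved so inference code can
# reproduce the same encoding, a minimal version might be the following; the
# transform() method and the unknown-character fallback to index 0 are
# assumptions.
from keras.preprocessing import sequence

class Vectorizer(object):
    def __init__(self, char_dict, maxlen=None):
        # char_dict maps characters to integer indexes ( 0 reserved for padding )
        self.char_dict = char_dict
        self.maxlen = maxlen

    def transform(self, domains):
        # Encode each domain as an integer sequence and zero-pad to maxlen
        encoded = [[self.char_dict.get(c, 0) for c in d] for d in domains]
        return sequence.pad_sequences(encoded, maxlen=self.maxlen)
# ---------------------------------------------------------------------------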
def excute():
    # Execute data analysis, feature building and model building to classify
    # DGA domains with a multiclass random forest.
    # Module-level names assumed from the surrounding file: data, entropy(),
    # domain_extract(), show_confusion_matrix(), save_model_to_disk(), plus
    # np, pd, plt, sklearn, operator, RandomForestClassifier, label_binarize,
    # train_test_split, roc_curve, roc_auc_score, sys, traceback.
    final_report = dict()
    try:
        print '*********** MULTICLASS DGA CLASSIFIER BY RANDOM FOREST ***************'
        print '[*] Loading pandas dataframe.....'
        data_training = data.get_training_data()
        # Extract data: X holds the domains, labels holds the class names
        X = [x[0] for x in data_training]
        labels = [x[1] for x in data_training]
        binary_labels = ['legit' if x == 'legit' else 'dga' for x in labels]
        domain_dict = {
            'domain': X,
            'class': labels,
            'bin_class': binary_labels
        }

        # Build the pandas DataFrame
        dataframe = pd.DataFrame(domain_dict)
        dataframe = dataframe.dropna()
        dataframe = dataframe.drop_duplicates()
        print '[*] DataFrame info :'
        print dataframe.info()

        # Shuffle data for training and testing
        dataframe = dataframe.reindex(np.random.permutation(dataframe.index))
        print '[*] Shuffled dataframe data.....'
        print '[*] Dataframe top 20 domains:'
        print dataframe.head(n=20)

        # Boolean conditions to slice the dataframe per class
        condition_legit_domain = dataframe['class'] == 'legit'
        condition_dga_domain = ~condition_legit_domain
        condition_banjori_domain = dataframe['class'] == 'banjori'
        condition_corebot_domain = dataframe['class'] == 'corebot'
        condition_cryptolocker_domain = dataframe['class'] == 'cryptolocker'
        condition_dircrypt_domain = dataframe['class'] == 'dircrypt'
        condition_kraken_domain = dataframe['class'] == 'kraken'
        condition_locky_domain = dataframe['class'] == 'locky'
        condition_pykspa_domain = dataframe['class'] == 'pykspa'
        condition_qakbot_domain = dataframe['class'] == 'qakbot'
        condition_ramdo_domain = dataframe['class'] == 'ramdo'
        condition_ramnit_domain = dataframe['class'] == 'ramnit'
        condition_simda_domain = dataframe['class'] == 'simda'
        print '[*] Total legit ( Alexa based ) domains : %d' % dataframe[
            condition_legit_domain].shape[0]
        print '[*] Total dga domains : %d' % dataframe[
            condition_dga_domain].shape[0]

        # Add a length field to the dataframe
        dataframe['length'] = [len(x) for x in dataframe['domain']]
        # Calculate and add an entropy field to the dataframe
        dataframe['entropy'] = [
            entropy(domain=domain) for domain in dataframe['domain']
        ]
        print '[*] Show complete dataframe top 50 :'
        print dataframe.head(n=50)

        # Slice sub-dataframes per class ( legit_domain and dga_domain are
        # used to fit the ngram vectorizers below )
        dga_domain = dataframe[condition_dga_domain]
        legit_domain = dataframe[condition_legit_domain]
        banjori_domain = dataframe[condition_banjori_domain]
        corebot_domain = dataframe[condition_corebot_domain]
        cryptolocker_domain = dataframe[condition_cryptolocker_domain]
        dircrypt_domain = dataframe[condition_dircrypt_domain]
        kraken_domain = dataframe[condition_kraken_domain]
        locky_domain = dataframe[condition_locky_domain]
        pykspa_domain = dataframe[condition_pykspa_domain]
        qakbot_domain = dataframe[condition_qakbot_domain]
        ramdo_domain = dataframe[condition_ramdo_domain]
        ramnit_domain = dataframe[condition_ramnit_domain]
        simda_domain = dataframe[condition_simda_domain]

        # Build features ( ngram based ): character ngrams with n in 3..5,
        # then vectorize input domains with the extracted features.
        # Build the legit base feature
        legit_count_vectorized = sklearn.feature_extraction.text.CountVectorizer(
            analyzer='char', ngram_range=(3, 5), min_df=1e-4, max_df=1.0)
        legit_domain_matrix = legit_count_vectorized.fit_transform(
            legit_domain['domain'])
        count_each_feature_legit_domain_matrix = np.log10(
            legit_domain_matrix.sum(axis=0).getA1())
        feature_list_legit_domain = legit_count_vectorized.get_feature_names()
        sorted_count_each_feature_legit_domain = sorted(
            zip(feature_list_legit_domain,
                count_each_feature_legit_domain_matrix),
            key=operator.itemgetter(1),
            reverse=True)
        print '[*] Legit number of ngram features: %d' % len(
            sorted_count_each_feature_legit_domain)

        number_feature_show = 10

        def print_top_ngram_features(sorted_features):
            print '>> Top %d highest count ngram features :' % number_feature_show
            print '%15s %10s' % ('ngram-feature', 'count')
            print '-' * 30
            for ngram_feature, count in sorted_features[:number_feature_show]:
                print '%15s %10f' % (ngram_feature, count)
            print '-' * 30

        print_top_ngram_features(sorted_count_each_feature_legit_domain)

        # Build the dga base feature
        dga_count_vectorized = sklearn.feature_extraction.text.CountVectorizer(
            analyzer='char', ngram_range=(3, 5), min_df=1e-4, max_df=1.0)
        dga_domain_matrix = dga_count_vectorized.fit_transform(
            dga_domain['domain'])
        count_each_feature_dga_domain_matrix = np.log10(
            dga_domain_matrix.sum(axis=0).getA1())
        feature_list_dga_domain = dga_count_vectorized.get_feature_names()
        sorted_count_each_feature_dga_domain = sorted(
            zip(feature_list_dga_domain, count_each_feature_dga_domain_matrix),
            key=operator.itemgetter(1),
            reverse=True)
        print '[*] DGA number of ngram features: %d' % len(
            sorted_count_each_feature_dga_domain)
        print_top_ngram_features(sorted_count_each_feature_dga_domain)

        # Build the dictionary base feature
        # Load the dictionary for the calculations below
        print '[*] Loading dictionary from ~/data/dictionary.txt .....'
        dictionary_dataframe = pd.read_csv('./data/dictionary.txt',
                                           names=['word'],
                                           header=None,
                                           dtype={'word': np.str},
                                           encoding='utf-8')
        # Preprocess dictionary_dataframe before using it to build training data
        dictionary_dataframe = dictionary_dataframe[
            dictionary_dataframe['word'].map(lambda x: str(x).isalpha())]
        dictionary_dataframe = dictionary_dataframe.applymap(
            lambda x: str(x).strip().lower())
        dictionary_dataframe = dictionary_dataframe.dropna()
        dictionary_dataframe = dictionary_dataframe.drop_duplicates()
        print '[*] Dictionary after preprocessing :'
        print dictionary_dataframe.head(n=10)

        # Build a count vectorizer for the dictionary to score domains against it
        dictionary_count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
            analyzer='char', ngram_range=(3, 5), min_df=1e-5, max_df=1.0)
        dictionary_domain_matrix = dictionary_count_vectorizer.fit_transform(
            dictionary_dataframe['word'])
        count_each_feature_dictionary_domain_matrix = np.log10(
            dictionary_domain_matrix.sum(axis=0).getA1())
        feature_list_dictionary_domain = dictionary_count_vectorizer.get_feature_names()
        sorted_count_each_feature_dict_domain = sorted(
            zip(feature_list_dictionary_domain,
                count_each_feature_dictionary_domain_matrix),
            key=operator.itemgetter(1),
            reverse=True)
        print '[*] Dictionary number of ngram features: %d' % len(
            sorted_count_each_feature_dict_domain)
        print_top_ngram_features(sorted_count_each_feature_dict_domain)

        def ngram_domain_score(domain):
            legit_score = count_each_feature_legit_domain_matrix * \
                legit_count_vectorized.transform([domain]).T
            dict_score = count_each_feature_dictionary_domain_matrix * \
                dictionary_count_vectorizer.transform([domain]).T
            dga_score = count_each_feature_dga_domain_matrix * \
                dga_count_vectorized.transform([domain]).T
            print '>>>> Domain \'%s\' :\n $legit_score = %f\n $dict_score = %f\n $dga_score = %f' \
                % (domain, legit_score, dict_score, dga_score)

        # Test the domain scores on some popular domains
        ngram_domain_score(domain='google')
        ngram_domain_score(domain='facebook')
        ngram_domain_score(domain='vnexpress')
        ngram_domain_score(domain='tinhte')
        ngram_domain_score(domain='kenh14')
        ngram_domain_score(domain='zing')
        ngram_domain_score(domain='chiasenhac')

        # Calculate domain scores for all domains
        dataframe['legit_score'] = count_each_feature_legit_domain_matrix * \
            legit_count_vectorized.transform(dataframe['domain']).T
        dataframe['dict_score'] = count_each_feature_dictionary_domain_matrix * \
            dictionary_count_vectorizer.transform(dataframe['domain']).T
        dataframe['dga_score'] = count_each_feature_dga_domain_matrix * \
            dga_count_vectorized.transform(dataframe['domain']).T

        # Divergence between legit and dict scores:
        # legit_score > dict_score => more legit, otherwise => more dictionary-like
        dataframe['divergence_legit_dict'] = \
            dataframe['legit_score'] - dataframe['dict_score']
        # Divergence between legit and dga scores:
        # legit_score > dga_score => more legit, otherwise => more dga-like
        dataframe['divergence_legit_dga'] = \
            dataframe['legit_score'] - dataframe['dga_score']

        # Domains that look more like dictionary words than websites
        print '[*] Domains that look more dictionary-like than web-like ( lowest divergence_legit_dict )'
        print dataframe.sort_values(by=['divergence_legit_dict'],
                                    ascending=True, kind='quicksort').head(n=10)
        # Domains that look more like websites than dictionary words
        print '[*] Domains that look more web-like than dictionary-like ( highest divergence_legit_dict )'
        print dataframe.sort_values(by=['divergence_legit_dict'],
                                    ascending=False, kind='quicksort').head(n=10)
        # Domains that look more dga than legit
        print '[*] Domains that look more dga than legit ( lowest divergence_legit_dga )'
        print dataframe.sort_values(by=['divergence_legit_dga'],
                                    ascending=True, kind='quicksort').head(n=10)
        # Domains that look more legit than dga
        print '[*] Domains that look more legit than dga ( highest divergence_legit_dga )'
        print dataframe.sort_values(by=['divergence_legit_dga'],
                                    ascending=False, kind='quicksort').head(n=10)

        # Rebuild the sub-dataframes so they include the new score columns
        dga_domain = dataframe[condition_dga_domain]
        legit_domain = dataframe[condition_legit_domain]
        banjori_domain = dataframe[condition_banjori_domain]
        corebot_domain = dataframe[condition_corebot_domain]
        cryptolocker_domain = dataframe[condition_cryptolocker_domain]
        dircrypt_domain = dataframe[condition_dircrypt_domain]
        kraken_domain = dataframe[condition_kraken_domain]
        locky_domain = dataframe[condition_locky_domain]
        pykspa_domain = dataframe[condition_pykspa_domain]
        qakbot_domain = dataframe[condition_qakbot_domain]
        ramdo_domain = dataframe[condition_ramdo_domain]
        ramnit_domain = dataframe[condition_ramnit_domain]
        simda_domain = dataframe[condition_simda_domain]

        print '[*] Analyze fields in dataframe :'
        print '>>> Legit domains'
        print dataframe[condition_legit_domain].describe()
        print '>>> Dga domains'
        print dataframe[condition_dga_domain].describe()
        print '-' * 30

        # Data analysis and preprocessing are done; next, build a RandomForest
        # model to train on and classify dga domains
        randomforest_labels = ['legit', 'dga']
        randomforest_multilabels = [
            'legit', 'banjori', 'corebot', 'cryptolocker', 'dircrypt',
            'kraken', 'locky', 'pykspa', 'qakbot', 'ramdo', 'ramnit', 'simda'
        ]
        # X -> training dataset, y -> labels
        X = dataframe.as_matrix(
            ['length', 'entropy', 'legit_score', 'dict_score', 'dga_score'])
        y = np.array(dataframe['bin_class'])
        y_multiclass = np.array(dataframe['class'])
        y_multiclass_binarize = label_binarize(
            y_multiclass, classes=randomforest_multilabels)
        randomforest_multilabels_binarize = label_binarize(
            randomforest_multilabels, classes=randomforest_multilabels)

        # RandomForest model to classify
        randomforest_clf = RandomForestClassifier(n_estimators=20,
                                                  criterion='entropy',
                                                  bootstrap=True,
                                                  n_jobs=-1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_multiclass, test_size=0.2)
        print '[*] Training data......'
        randomforest_clf.fit(X_train, y_train)

        # For multiclass metrics, convert y_test to a binarized form
        y_test_bin = label_binarize(y_test, randomforest_clf.classes_)
        num_class = y_test_bin.shape[1]
        y_test_bin = np.array([x.astype(int) for x in y_test_bin])

        # Dictionaries to translate index <-> class easily
        index_class_dict = {}
        class_index_dict = {}
        for idx, x in enumerate(randomforest_clf.classes_):
            index_class_dict[idx] = x
            class_index_dict[x] = idx

        print '[*] Testing data.....'
        y_predict = randomforest_clf.predict(X_test)
        y_predict_prob = randomforest_clf.predict_proba(X_test)
        confusion_matrix = sklearn.metrics.confusion_matrix(
            y_test, y_predict, randomforest_multilabels)
        print '[*] Show confusion matrix :'
        print confusion_matrix
        print '[*] Analyzing result.......'
        show_confusion_matrix(confusion_matrix, randomforest_multilabels)

        importances_feature = zip(
            ['length', 'entropy', 'legit_score', 'dict_score', 'dga_score'],
            randomforest_clf.feature_importances_)
        print '==>> Feature importances :'
        for feature, importance in importances_feature:
            print ' > Feature \'%s\' ( entropy-based importance ) = %f' % (
                feature, importance)

        print '[*] Classes recognized by the randomforest classifier :'
        print randomforest_clf.classes_
        prob_perclass_dict = dict()
        name_to_index_dict = dict()
        index_to_name_dict = dict()
        for index, class_name in enumerate(randomforest_clf.classes_):
            prob_perclass_dict[class_name] = y_predict_prob[:, index]
            name_to_index_dict[class_name] = index
            index_to_name_dict[index] = class_name

        # Convert labels to binary truth values
        y_test_true_table = [0 if label == 'legit' else 1 for label in y_test]
        y_test_true_table_multi = [
            1 if label == 'legit' else 0 for label in y_test
        ]

        # For multiclass, calculate per-class and micro-averaged AUC
        fpr = {}
        tpr = {}
        roc_auc = {}
        for i in range(num_class):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i],
                                          y_predict_prob[:, i])
            roc_auc[i] = roc_auc_score(y_test_bin[:, i], y_predict_prob[:, i])
        fpr['micro'], tpr['micro'], _ = roc_curve(y_test_bin.ravel(),
                                                  y_predict_prob.ravel())
        roc_auc['micro'] = roc_auc_score(y_test_bin.ravel(),
                                         y_predict_prob.ravel())
        print '[*] Micro AUC is %f' % roc_auc['micro']

        # Retrain on the whole dataframe
        print '[*] Training on the whole dataset.......'
        randomforest_clf.fit(X, y_multiclass)

        def quick_test(uri):
            domain = domain_extract(uri=uri)
            legit_score = count_each_feature_legit_domain_matrix * \
                legit_count_vectorized.transform([domain]).T
            dict_score = count_each_feature_dictionary_domain_matrix * \
                dictionary_count_vectorizer.transform([domain]).T
            dga_score = count_each_feature_dga_domain_matrix * \
                dga_count_vectorized.transform([domain]).T
            vectorized_domain = [
                len(domain), entropy(domain), legit_score, dict_score,
                dga_score
            ]
            print '>>>> Test domain \'%s\' : %s' % (
                uri, randomforest_clf.predict([vectorized_domain])[0])

        # Test on some legit and dga domains
        print '[*] Test on some domains : '
        quick_test('google.com.vn')
        quick_test('vnexpress.net')
        quick_test('tinhte.vn')
        quick_test('kenh14.vn')
        quick_test('40a43e61e56a5c218cf6c22aca27f7ee.org')
        quick_test('agabgtdhgsbspwsq.ru')
        quick_test('dantri.net')
        quick_test('axtopsbtntqnfdyk.ru')
        quick_test('ahamove.com.vn')
        quick_test('batqeodiji.com')
        quick_test('bdjhtgqhggicwrmy.ru')
        quick_test('melhlehkvxoxbqq.net')
        print '==>> Finish testing'

        # Save the whole model
        print '[*] Saving models to disk:'
        save_model_to_disk('random_forest_classifier_multiclass',
                           randomforest_clf)
        plt.close()

        final_report = {
            'y': y_test_bin,
            'labels': y_test,
            'probs': prob_perclass_dict,
            'epoch': 0,
            'confusion_matrix': confusion_matrix,
            'name_to_index': name_to_index_dict,
            'index_to_name': index_to_name_dict,
            'y_predict_prob': y_predict_prob
        }
    except KeyboardInterrupt:
        print '>>>>>>>> Terminating.....'
        sys.exit(0)
    except Exception as error:
        print '>>>>>>>> Cannot build model :(....'
        traceback.print_exc()
        print 'Error occurred : %s' % str(error)
        sys.exit(1)
    return final_report
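
# ---------------------------------------------------------------------------
# Hedged sketch ( not part of the original source ): save_model_to_disk(name,
# obj) is a project helper not shown here. For the sklearn objects and numpy
# matrices saved above, a plain pickle dump like the following would fit the
# call signature; the output directory and file extension are assumptions
# ( the Keras models saved by the neural network variants would more likely go
# through model.save() / HDF5 instead of pickle ).
import pickle

def save_model_to_disk(name, obj, directory='./models/'):
    # Serialize the object to <directory><name>.pkl
    with open(directory + name + '.pkl', 'wb') as handle:
        pickle.dump(obj, handle)
# ---------------------------------------------------------------------------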