def run(max_epoch=50, nfolds=10, batch_size=128,verbose=0):
    """Run train/test on logistic regression model.

    Vectorizes domains as character-bigram counts, then runs `nfolds`
    independent train/test folds with per-epoch early stopping on a small
    holdout AUC.

    Args:
        max_epoch: maximum training epochs per fold.
        nfolds: number of independent train/test folds.
        batch_size: mini-batch size for Keras `fit`.
        verbose: verbosity level forwarded to Keras fit/predict.

    Returns:
        List (one entry per fold) of dicts with test labels, predicted
        probabilities, the best epoch index, and a confusion matrix.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors: character-bigram counts over each domain string
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = ngram_vectorizer.fit_transform(X)
    max_features = count_vec.shape[1]

    # Convert labels to 0-1 (anything not 'benign' is treated as malicious)
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(count_vec, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features)

        print("Train...")
        # Carve a 5% holdout out of the training split for early stopping
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            # .todense() because Keras does not accept scipy sparse input here
            model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1,verbose=verbose)

            t_probs = model.predict_proba(X_holdout.todense(),verbose=verbose)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                # Only re-evaluate on the test split when the holdout improves
                probs = model.predict_proba(X_test.todense(),verbose=verbose)

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 5:
                    break

        final_data.append(out_data)

    return final_data
def train(max_epoch=25, batch_size=128):
    """Train a model on the full dataset, using a 5% holdout for early stopping.

    Args:
        max_epoch: maximum number of training epochs.
        batch_size: mini-batch size passed to Keras ``fit``.

    Returns:
        The trained Keras model.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is left free for padding
    all_chars = string.ascii_letters + string.digits + '-.'
    valid_chars = {x: idx + 1 for idx, x in enumerate(all_chars)}

    print('Build model...')
    model = build_model()

    maxlen = 256

    # Convert characters to int and pad.
    # NOTE(review): a domain containing a character outside all_chars raises
    # KeyError here — confirm upstream data is restricted to this alphabet.
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.05)

    best_iter = -1
    best_auc = 0.0
    for ep in range(max_epoch):
        # 'epochs' replaces the Keras 1 'nb_epoch' kwarg (deprecated in Keras 2),
        # matching the other training loops in this file.
        model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

        t_probs = model.predict_proba(X_holdout)
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep
        else:
            # No longer improving...break and calc statistics
            if (ep - best_iter) > 2:
                break

    return model
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Character-level sequence model over integer-encoded domains; tracks the
    best model (by holdout AUC) across all folds and saves it at the end.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters (built from the data; 0 = pad)
    valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    # Best model across all folds, keyed by holdout AUC
    nfolds_best_model = {'best_auc': 0.0, 'best_model': None}
    for fold in range(nfolds):
        print "fold %u/%u" % (fold+1, nfolds)
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels, test_size=0.2)

        print 'Build model...'
        model = build_model(max_features, maxlen)

        print "Train..."
        # 5% of the training split held out for early stopping
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print '\nEpoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc)

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print '\n', sklearn.metrics.confusion_matrix(y_test, probs > .5)
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 2:
                    break

        final_data.append(out_data)

        # Keep whichever fold's model had the best holdout AUC
        nfolds_best_auc = nfolds_best_model['best_auc']
        if best_auc > nfolds_best_auc:
            nfolds_best_model['best_auc'] = best_auc
            nfolds_best_model['best_model'] = model

    best_model = nfolds_best_model['best_model']
    # NOTE(review): save_path is not defined in this function — presumably a
    # module-level constant; verify it exists before running.
    save_model(best_model, save_path)

    return final_data
def run(max_epoch=50, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Multi-output bigram variant: besides the main benign/malicious target it
    trains one extra target per malware family (via data.expand_labels), and
    evaluates AUC on the first (main) output only.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors
    print "vectorizing data"
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    X = ngram_vectorizer.fit_transform(X)
    max_features = X.shape[1]

    malware_labels = data.get_malware_labels(labels)
    # One label vector per target (main + one per family)
    all_Ys = data.expand_labels(labels)

    final_data = []
    for fold in range(nfolds):
        print "fold %u/%u" % (fold+1, nfolds)
        # train_test_split returns (train, test) pairs for every array passed;
        # the first six entries are X/labels/main-y, the rest are per-family ys
        train_test = train_test_split(X, labels, *all_Ys, test_size=0.2, stratify=labels)
        X_train, X_test, label_train, label_test, y_train, y_test = train_test[:6]
        dga_training_test = train_test[6:]
        all_Y_train = [y_train]
        # Even indices are the train halves of each per-family (train, test) pair
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        print 'Build model...'
        model = build_model(max_features, num_targets=len(malware_labels) + 1)

        print "Train..."
        # Same pairing trick again for the 5% early-stopping holdout
        train_test = train_test_split(X_train, *all_Y_train, test_size=0.05, stratify=label_train)
        X_train, X_holdout, y_train, y_holdout = train_test[:4]
        dga_training_test = train_test[4:]
        all_Y_train = [y_train]
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            model.fit(X_train.todense(), data.y_list_to_dict(all_Y_train), batch_size=batch_size, epochs=1)

            # Output 0 is the main benign/malicious head
            t_probs = model.predict(X_holdout.todense())[0]
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print 'Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc)

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict(X_test.todense())[0]

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print sklearn.metrics.confusion_matrix(y_test, probs > .5)
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 5:
                    break

        final_data.append(out_data)

    return final_data
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Character-sequence variant with hard-coded feature/length limits.

    Args:
        max_epoch: maximum training epochs per fold.
        nfolds: number of independent train/test folds.
        batch_size: mini-batch size for Keras ``fit``.

    Returns:
        List (one per fold) of dicts with test labels, probabilities, best
        epoch, and confusion matrix.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters
    valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

    #max_features = len(valid_chars) + 1
    #maxlen = np.max([len(x) for x in X])
    # NOTE(review): hard-coded limits — if the data alphabet exceeds 99
    # characters, indices will exceed max_features; confirm this is intended.
    max_features = 100
    maxlen = 256

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        # 5% holdout from the training split for early stopping
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 1 'nb_epoch' kwarg
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 2:
                    break

        final_data.append(out_data)

    return final_data
def run(max_epoch=15, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model"""
    domains, labels = zip(*data.get_data())

    # Integer-encode every character seen in the corpus; 0 stays free for padding.
    char_to_int = {ch: i + 1 for i, ch in enumerate(set(''.join(domains)))}
    max_features = len(char_to_int) + 1
    maxlen = np.max([len(d) for d in domains])

    # Encode each domain as a padded integer sequence.
    encoded = [[char_to_int[ch] for ch in d] for d in domains]
    X = sequence.pad_sequences(encoded, maxlen=maxlen)

    # Binary target: everything that is not 'benign' counts as malicious.
    y = np.asarray([0 if lab == 'benign' else 1 for lab in labels])

    final_data = []
    for fold in range(nfolds):
        print('fold {}/{}'.format(fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print('Train...')
        # Reserve 5% of the training split as an early-stopping holdout.
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)

        best_iter, best_auc = -1, 0.0
        out_data = {}
        for epoch in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            holdout_probs = model.predict_proba(X_holdout)
            auc = roc_auc_score(y_holdout, holdout_probs)
            print('Epoch {}: auc = {} (best={})'.format(epoch, auc, best_auc))

            if auc <= best_auc:
                # No longer improving...break and calc statistics
                if (epoch - best_iter) > 2:
                    break
                continue

            # New best holdout AUC: refresh the test-set snapshot.
            best_auc, best_iter = auc, epoch
            probs = model.predict_proba(X_test)
            out_data = {
                'y': y_test,
                'labels': label_test,
                'probs': probs,
                'epochs': epoch,
                'confusion_matrix': confusion_matrix(y_test, probs > .5)
            }
            print(confusion_matrix(y_test, probs > .5))

        final_data.append(out_data)

    return final_data
def run(max_epoch=2, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Variant that fits the bigram vocabulary on the training data plus a list
    of popular real domains, then reports training-set accuracy per fold and
    saves the model to 'bigramMode.h5'.

    Returns:
        final_data — note this variant never appends to it (the append is
        commented out below), so callers currently receive an empty list.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # add realdomins to the vecrization processing
    realdomains = []
    with open('realtopdomains.txt', 'r') as f:
        # strip() removes trailing newlines that would otherwise introduce
        # spurious '\n' bigrams into the vectorizer vocabulary
        realdomains = [l.strip() for l in f]
    lenofx = len(X)
    X += realdomains

    # Create feature vectors
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = ngram_vectorizer.fit_transform(X)
    """
    # get real data's vector matrix to ensure have the same shape with model
    pickle.dump(count_vec[lenofx:, :], open('realtopdomain.pkl', 'w'))
    """
    # Drop the real-domain rows again; they only contributed vocabulary
    count_vec = count_vec[:lenofx, :]
    max_features = count_vec.shape[1]

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            count_vec, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features)

        print("Train...")
        # Keep the pre-holdout training split for the accuracy report below
        xi = X_train
        yi = y_train
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 1 'nb_epoch' kwarg
            model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout.todense())
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                # NOTE(review): best-model tracking is deliberately disabled in
                # this variant; best_auc/best_iter never update, so the
                # early-stop branch below cannot trigger while AUC stays > 0.
                # best_auc = t_auc
                # best_iter = ep
                # probs = model.predict_proba(X_test.todense())
                # pre = model.predict_classes(X_test.todense())
                # out_data = {'y': y_test, 'labels': label_test, 'probs': probs,
                #             'epochs': ep,
                #             'confusion_matrix':
                #                 sklearn.metrics.confusion_matrix(y_test, probs > .5)}
                # recall = sklearn.metrics.recall_score(y_test, pre)
                # f1 = sklearn.metrics.f1_score(y_test, pre)
                pass
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 5:
                    break

        # Report accuracy on the (pre-holdout) training split
        pre_i = model.predict_classes(xi.todense())
        accuracy = sklearn.metrics.accuracy_score(yi, pre_i)
        print('\n accurracy: %f' % (accuracy))

        #final_data.append(out_data)
        model.save('bigramMode.h5')

    return final_data
def run(max_epoch=50, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Bigram-count variant that also tracks the best model across all folds
    (by holdout AUC) and saves it at the end.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors: character-bigram counts
    print "vectorizing data"
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = ngram_vectorizer.fit_transform(X)
    max_features = count_vec.shape[1]

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    # Best model across all folds, keyed by holdout AUC
    nfolds_best_model = {'best_auc': 0.0, 'best_model': None}
    for fold in range(nfolds):
        print "fold %u/%u" % (fold+1, nfolds)
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(count_vec, y, labels, test_size=0.2)

        print 'Build model...'
        model = build_model(max_features)

        print "Train..."
        # 5% holdout from the training split for early stopping
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            # .todense() because Keras does not take scipy sparse matrices here
            model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout.todense())
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print '\nEpoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc)

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test.todense())

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print '\n', sklearn.metrics.confusion_matrix(y_test, probs > .5)
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 5:
                    break

        final_data.append(out_data)

        # Keep whichever fold produced the best holdout AUC
        nfolds_best_auc = nfolds_best_model['best_auc']
        if best_auc > nfolds_best_auc:
            nfolds_best_model['best_auc'] = best_auc
            nfolds_best_model['best_model'] = model

    best_model = nfolds_best_model['best_model']
    # NOTE(review): save_path is not defined here — presumably module-level.
    save_model(best_model, save_path)

    # print 'final_data: ', final_data
    return final_data
def run(max_epoch=25, nfolds=10, batch_size=128, savemodel=False):
    """Run train/test on logistic regression model.

    Pickles the character-encoding parameters for later inference and can
    optionally save the last fold's model.

    Args:
        max_epoch: maximum training epochs per fold.
        nfolds: number of train/test folds.
        batch_size: mini-batch size for Keras ``fit``.
        savemodel: if True, save the last trained model to MODEL_FILE.

    Returns:
        List (one per fold) of dicts with test labels, probabilities, best
        epoch, and confusion matrix.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters (0 reserved for padding)
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # pickle encoding params so inference can reproduce the same encoding;
    # 'with' closes the handles (the original leaked open file objects)
    with open(VALIDCHAR_FILE, 'wb') as f:
        pickle.dump(valid_chars, f)
    with open(MAXLEN_FILE, 'wb') as f:
        pickle.dump(maxlen, f)

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 1 'nb_epoch' kwarg
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {
                    'y': y_test,
                    'labels': label_test,
                    'probs': probs,
                    'epochs': ep,
                    'confusion_matrix':
                        sklearn.metrics.confusion_matrix(y_test, probs > .5)
                }
                # print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
                # print("{} {}".format(__name__, "breaking prematurely"))
                # Stop after the first improving epoch (deliberate short-circuit)
                break
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        final_data.append(out_data)

    if savemodel:
        model.save(MODEL_FILE)

    return final_data
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    TensorBoard-instrumented variant; also dumps the character dictionary to
    dict.json and the model weights/architecture after each fold.

    Returns:
        List (one per fold) of dicts with test labels, probabilities, best
        epoch, and confusion matrix.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters and persist it for inference
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}
    with open('dict.json', 'w') as f:
        f.write(json.dumps(valid_chars))

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    tb = TensorBoard(log_dir='./logs',
                     histogram_freq=0,
                     batch_size=32,
                     write_graph=True,
                     write_grads=False,
                     write_images=False,
                     embeddings_freq=0,
                     embeddings_layer_names=None,
                     embeddings_metadata=None)
    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 1 'nb_epoch' kwarg
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1,
                      callbacks=[tb])

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            # F1 at a 0.5 threshold, reported alongside AUC
            y_est = [1 if x > 0.5 else 0 for x in t_probs]
            f1 = sklearn.metrics.f1_score(y_holdout, y_est)

            print('Epoch %d: auc = %f (best=%f) f1 = %f' % (ep, t_auc, best_auc, f1))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {
                    'y': y_test,
                    'labels': label_test,
                    'probs': probs,
                    'epochs': ep,
                    'confusion_matrix':
                        sklearn.metrics.confusion_matrix(y_test, probs > .5)
                }

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        final_data.append(out_data)

        # NOTE(review): overwritten each fold — only the last fold's model survives
        model.save_weights('model.hdf5')
        with open('model.json', 'w') as f:
            f.write(model.to_json())

    return final_data
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Multi-output character-sequence variant: one main benign/malicious target
    plus one target per malware family; AUC is evaluated on the main output.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters (0 reserved for padding)
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    malware_labels = data.get_malware_labels(labels)
    # One label vector per target (main + one per family)
    all_Ys = data.expand_labels(labels)

    final_data = []
    for fold in range(nfolds):
        print "fold %u/%u" % (fold + 1, nfolds)
        # First six entries are X/labels/main-y (train, test); the rest are
        # (train, test) pairs for each per-family target
        train_test = train_test_split(X, labels, *all_Ys, test_size=0.2, stratify=labels)
        X_train, X_test, label_train, label_test, y_train, y_test = train_test[:6]
        dga_training_test = train_test[6:]
        all_Y_train = [y_train]
        # Even indices are the train halves of each per-family pair
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        print 'Build model...'
        model = build_model(max_features, maxlen, num_targets=len(malware_labels) + 1)

        print "Train..."
        # Same pairing trick for the 5% early-stopping holdout
        train_test = train_test_split(X_train, *all_Y_train, test_size=0.05, stratify=label_train)
        X_train, X_holdout, y_train, y_holdout = train_test[:4]
        dga_training_test = train_test[4:]
        all_Y_train = [y_train]
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            model.fit(X_train, data.y_list_to_dict(all_Y_train), batch_size=batch_size, epochs=1)

            # Output 0 is the main benign/malicious head
            t_probs = model.predict(X_holdout)[0]
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print 'Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc)

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict(X_test)[0]

                out_data = {
                    'y': y_test,
                    'labels': label_test,
                    'probs': probs,
                    'epochs': ep,
                    'confusion_matrix':
                        sklearn.metrics.confusion_matrix(y_test, probs > .5)
                }

                print sklearn.metrics.confusion_matrix(y_test, probs > .5)
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        final_data.append(out_data)

    return final_data
def run(max_epoch=30, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    CNN variant over dense bigram vectors; sweeps kernel sizes (currently just
    500) and appends accuracy/recall/F1 per fold to f1result.txt.

    Returns:
        List (one per fold/kernel) of dicts with test labels, probabilities,
        best epoch, and confusion matrix.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = ngram_vectorizer.fit_transform(X)
    max_features = count_vec.shape[1]
    #count_vec = np.expand_dims(count_vec, axis=2)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            count_vec, y, labels, test_size=0.2)

        # Single-element sweep kept so more kernel sizes can be added later
        for kernelSize in (500, ):
            print('Build model...')
            model = build_model(max_features, kernelSize)

            print("Train...")
            X_train, X_holdout, y_train, y_holdout = train_test_split(
                X_train, y_train, test_size=0.05)
            best_iter = -1
            best_auc = 0.0
            out_data = {}
            for ep in range(max_epoch):
                # expand_dims adds the channel axis the Conv layer expects;
                # 'epochs' replaces the deprecated Keras 1 'nb_epoch' kwarg
                model.fit(np.expand_dims(X_train.todense(), axis=2),
                          y_train,
                          batch_size=batch_size,
                          epochs=1)

                t_probs = model.predict_proba(
                    np.expand_dims(X_holdout.todense(), axis=2))
                t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

                print('Epoch %d: auc = %f (best=%f)\n' % (ep, t_auc, best_auc))

                # Per-epoch test metrics, logged regardless of improvement
                pre = model.predict_classes(
                    np.expand_dims(X_test.todense(), axis=2))
                accuracyT = sklearn.metrics.accuracy_score(y_test, pre)
                recallT = sklearn.metrics.recall_score(y_test, pre)
                f1T = sklearn.metrics.f1_score(y_test, pre)
                print('evaluate:', str(accuracyT), ' ', str(recallT), ' ',
                      str(f1T), '\n')

                if t_auc > best_auc:
                    best_auc = t_auc
                    best_iter = ep

                    probs = model.predict_proba(
                        np.expand_dims(X_test.todense(), axis=2))
                    pre = model.predict_classes(
                        np.expand_dims(X_test.todense(), axis=2))

                    out_data = {
                        'y': y_test,
                        'labels': label_test,
                        'probs': probs,
                        'epochs': ep,
                        'confusion_matrix':
                            sklearn.metrics.confusion_matrix(y_test, probs > .5)
                    }

                    accuracy = sklearn.metrics.accuracy_score(y_test, pre)
                    recall = sklearn.metrics.recall_score(y_test, pre)
                    f1 = sklearn.metrics.f1_score(y_test, pre)
                else:
                    # No longer improving...break and calc statistics
                    if (ep - best_iter) > 5:
                        break

            final_data.append(out_data)

            with open("f1result.txt", 'a') as f:
                f.write("##CNN result: \n")
                f.write("kernelsize: ")
                f.write(str(kernelSize))
                f.write(":accuracy: ")
                f.write(str(accuracy))
                f.write("; recall: ")
                f.write(str(recall))
                f.write("; f1: ")
                f.write(str(f1))
                f.write("\n")

    return final_data
def run(max_epoch=50, batch_size=128, cata_split=True, multi_class=False, ratio=None, data_cache=False):
    # Combines an LSTM over the second-level domain with a one-hot encoding
    # of the top-level domain; supports binary or multi-class targets, a
    # category-aware split, and an on-disk cache of the prepared split.
    global filepath
    """Run train/test on logistic regression model"""
    print('method is LSTM(SLD) + One-Hot(TLD)')
    indata = data.get_data()

    # Extract data and labels (fixed seed so the shuffle is reproducible)
    randnum = 1
    random.seed(randnum)
    random.shuffle(indata)
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]
    tops = [x[2] for x in indata]

    # One hot top domain one-hot features
    le = LabelEncoder()
    new_tops = le.fit_transform(tops)
    new_tops = new_tops.reshape(-1, 1)
    # NOTE(review): 'sparse' is deprecated in newer scikit-learn in favour of
    # 'sparse_output' — confirm the pinned sklearn version accepts it.
    top_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    top_enc.fit(new_tops)
    feature_top = top_enc.transform(new_tops)
    tld_feature_dimension = feature_top.shape[1]
    print('tld_feature_dimension(the number of top domain feature)', tld_feature_dimension)

    # Generate a dictionary of valid characters (0 reserved for padding)
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    char_feature_dimension = len(valid_chars) + 1
    max_seq_len = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=max_seq_len)
    print('X shape', X.shape)

    # Convert labels to 0-1
    if not multi_class:
        y = [0 if x == 'benign' else 1 for x in labels]
        class_num = 2
    else:
        # Multi-class: label->index mapping and class count come from a pickle
        DATA_FILE = 'class_dict.pkl'
        fopen = open(DATA_FILE, 'rb')
        label_dict = pickle.load(fopen)
        print('label_dict', label_dict)
        class_num = pickle.load(fopen)
        print('class_num', class_num)
        y = []
        if ratio != None:
            # Collapse all classes above the ratio cutoff into one bucket
            max_class_num = int(class_num * ratio) + 1
            print('max_class_num', max_class_num)
            for x in labels:
                if label_dict[x] > class_num * ratio:
                    y.append(max_class_num)
                else:
                    y.append(label_dict[x])
        else:
            y = [label_dict[x] for x in labels]
        print('y', y[:100])
        print('first_ y', Counter(y))
        y = to_categorical(y, num_classes=class_num)

    # concatenate all the features (padded char sequence + TLD one-hot)
    X = np.concatenate((X, feature_top), axis=-1)
    print('X shape(after feature concatenate)', X.shape)

    final_data = []
    print('cata_split = ', cata_split)
    # NOTE(review): log handle is opened here but only closed at the very end;
    # an exception during training leaks it.
    fwrite = open('bigram_lstm_fe.log', 'w')

    if data_cache == True and os.path.isfile('bigram_lstm_fe_top_data.npz'):
        # Reuse the previously prepared split
        np_data = np.load('bigram_lstm_fe_top_data.npz')
        X_train = np_data['X_train']
        y_train = np_data['y_train']
        X_test = np_data['X_test']
        y_test = np_data['y_test']
        label_test = np_data['label_test']
    else:
        if cata_split == False:
            print('cata split is false')
            X_train, X_test, y_train, y_test, _, label_test = train_test_split(
                X, y, labels, test_size=0.2, random_state=1)
        else:
            # Category-aware split from the project's split helper
            if ratio != None:
                X_train, X_test, y_train, y_test, label_train, label_test = split.train_test_split_as_catagory(
                    X, y, labels, cata=max_class_num)
            else:
                X_train, X_test, y_train, y_test, label_train, label_test = split.train_test_split_as_catagory(
                    X, y, labels, cata='symmi')
        print('X_train shape', X_train.shape)
        print('X shape', X.shape)
        # Cache the split for subsequent runs with data_cache=True
        np.savez('bigram_lstm_fe_top_data.npz',
                 X_train=X_train,
                 y_train=y_train,
                 X_test=X_test,
                 y_test=y_test,
                 label_test=label_test)
        print('bigram_lstm_fe_top_data.npz has been saved')

    print('type X_train', type(X_train))

    print('Build model...')
    model = build_model(char_feature_dimension, tld_feature_dimension, max_seq_len, multi_class, class_num)

    print("Train...")
    best_iter = -1
    best_acc = 0.0
    for ep in range(max_epoch):
        model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

        if not multi_class:
            # Binary: accuracy of thresholded probabilities on the test split
            t_probs = model.predict_proba(X_test)
            t_acc = sklearn.metrics.accuracy_score(y_test, t_probs > .5)
            print('Epoch %d: acc = %f (best=%f)' % (ep, t_acc, best_acc))
            fwrite.write('Epoch %d: acc = %f (best=%f)\n' % (ep, t_acc, best_acc))
            probs = t_probs
            print('test confusion matrix')
            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            # Multi-class: use Keras' own accuracy metric
            score = model.evaluate(X_test, y_test, verbose=0)
            t_acc = score[1]
            print('Epoch %d: acc = %f (best=%f)' % (ep, t_acc, best_acc))
            fwrite.write('Epoch %d: acc = %f (best=%f)\n' % (ep, t_acc, best_acc))

        if t_acc > best_acc:
            best_acc = t_acc
            best_iter = ep
            # Save every new best model to the module-level 'filepath'
            model.save(filepath)
            opt_model = model
            print('newest model has been saved')

    fwrite.close()
def run(max_epoch=25, nfolds=1, batch_size=1024 * 4):
    """Run train/test on model.

    Trains the character-sequence model and, whenever the holdout AUC
    improves, also scores every domain from a proxy log CSV and prints the
    per-domain probabilities.
    """
    indata = data.get_data()

    # Proxy log to score; rows with missing values are dropped
    domain_list = pd.read_csv("proxy_log_04_10.csv")
    domain_list = domain_list.dropna()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters (0 reserved for padding)
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    #save the valid_chars as pickle file
    #valid_chars_pkl = open('valid_chars.pkl','wb')
    #pickle.dump(valid_chars, valid_chars_pkl)

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    print max_features, maxlen

    # Encode the proxy-log domains with the same dictionary.
    # NOTE(review): a proxy domain containing a character unseen in training
    # data raises KeyError here — confirm inputs or add a fallback.
    domain_list['tld'] = domain_list['domain'].apply(
        lambda x: tldextract.extract(x).domain)
    test_data = domain_list['tld'].tolist()
    test_data1 = [[valid_chars[c] for c in x] for x in test_data]
    test_data2 = sequence.pad_sequences(test_data1, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]
    print 'length of y'
    print len(y)

    final_data = []
    for fold in range(nfolds):
        print "fold %u/%u" % (fold + 1, nfolds)
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print 'Build model...'
        model = build_model(max_features, maxlen)

        print "Train..."
        # 5% holdout from the training split for early stopping
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}
        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print 'Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc)

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict(X_test)

                out_data = {
                    'y': y_test,
                    'labels': label_test,
                    'probs': probs,
                    'epochs': ep
                }
                # 'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                # Score and print every proxy-log domain with the current model
                proxy_probs = model.predict(test_data2)
                for i in range(len(proxy_probs)):
                    print test_data[i], proxy_probs[i, 0]

                # print sklearn.metrics.confusion_matrix(y_test, probs > .5)
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        model.summary()
        final_data.append(out_data)

    return final_data