def _pull_layers(self):
    """Set ``self.layers`` from the best stored hyperparameter results.

    Pulls the top results for ``self.attack_type`` from the database and
    parses the comma-separated ``layers`` string (e.g. ``"16,32,16"``)
    into a list of ints.

    Raises:
        Exception: if no hyperparameter rows exist for this attack type.
    """
    conn = SQLConnector()
    jsonlist = conn.pull_best_results(attack=self.attack_type, num=5, verbose=False)
    # BUG FIX: the original raised when results DID exist (`if jsonlist:`);
    # the error message makes clear it should fire only when nothing came back.
    if not jsonlist:
        raise Exception('Hyper data does not exist for ' + self.attack_type)
    # Take the best (first) row.  (The original bound this to `json`,
    # shadowing the stdlib module name.)
    best = jsonlist[0]
    layersstr = best['layers']
    # Parse "a,b,c" into ints; split() replaces the manual index() slicing
    # and generalizes to any number of comma-separated layer sizes.
    self.layers = [int(part) for part in layersstr.split(",")]
def setup(self):
    """Fetch and label-encode training attacks for this attack type, then
    prepare the real/fake target vectors used during adversarial training."""
    print("Attack type: " + self.attack_type)
    db = SQLConnector()
    rows = db.pull_kdd99(attack=self.attack_type, num=4000)
    frame = pd.DataFrame.from_records(
        data=rows, columns=db.pull_kdd99_columns(allQ=True))

    # ==========
    #  ENCODING
    # ==========
    # One LabelEncoder per column, keyed by column name:
    # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
    encoders = defaultdict(LabelEncoder)
    encoded = frame.apply(lambda col: encoders[col.name].fit_transform(col))
    dataset = encoded.values  # DataFrame -> ndarray

    # Visual sanity check of the encoding.
    print("Real encoded " + self.attack_type + " attacks:")
    print(dataset[:1])

    # Columns 0..40 are features; column 41 is the attack label.
    self.X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]

    # Ground-truth targets: 1 for real attacks, 0 for generated (fake) ones.
    self.valid = np.ones((self.batch_size, 1))
    self.fake = np.zeros((self.batch_size, 1))
def main():
    """Build an encoded satan/non-satan evaluation CSV from the database."""
    db = SQLConnector()
    records = np.asarray(db.pull_evaluator_data(30000, 'satan'))
    frame = pd.DataFrame.from_records(
        data=records, columns=db.pull_kdd99_columns(allQ=True))

    features = frame.iloc[:, :41]
    attacks = frame.iloc[:, 41:]
    print(attacks.at[0, 'attack_type'])
    print(type(attacks.at[0, 'attack_type']))

    # Map attack names to their numeric codes.
    for row in range(attacks.size):
        attacks.at[row, 'attack_type'] = util.attacks_to_num(
            attacks.at[row, 'attack_type'])

    # Binarise: code 16 -> 1, everything else -> 0.
    for row in range(attacks.size):
        attacks.at[row, 'attack_type'] = 1 if attacks.at[row, 'attack_type'] == 16 else 0
    print(attacks)

    # Per-column label encoding of the features, then rejoin with the labels
    # and shuffle the rows.
    encoders = defaultdict(LabelEncoder)
    encoded_features_df = features.apply(lambda col: encoders[col.name].fit_transform(col))
    eval_dataset_df = shuffle(encoded_features_df.join(attacks))
    print(eval_dataset_df)

    # Persist the encoded rows for later evaluator training.
    eval_dataset_df.to_csv('SatanAndNonsatan.csv', header=False, index=False)
def signal_handler(sig, frame):
    """SIGINT handler: dump the hypers and gens tables, then terminate."""
    db = SQLConnector()
    hypers = db.read_hyper()  # by epoch?
    gens = db.read_gens()  # by epoch?
    print("\n\nMYSQL DATA:\n==============")
    print(f"hypers {hypers}")
    print(f"\ngens {gens}\n")
    sys.exit(0)


# Install the handler so Ctrl-C triggers the dump above before exiting.
signal.signal(signal.SIGINT, signal_handler)
def train(self):
    """Train the GAN adversarially for ``self.max_epochs`` epochs.

    Each epoch trains the discriminator on one real batch and one
    generated batch, then trains the generator through the combined
    model.  Progress is printed every 500 epochs.
    """
    # Divergence bookkeeping for the periodic progress report.
    loss_increase_count = 0
    prev_g_loss = 0
    # NOTE: unused in this variant; kept for parity with sibling train()
    # methods that persist results (constructor may open the connection).
    conn = SQLConnector()
    # Fixed batch: the first batch_size rows of the training data.
    idx = np.arange(self.batch_size)
    for epoch in range(self.max_epochs):
        # selecting batch_size random attacks from our training data
        # idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = self.X_train[idx]
        # One 41-dim noise vector per sample; generator maps noise -> rows.
        noise = np.random.normal(0, 1, (self.batch_size, 41))
        gen_attacks = self.generator.predict(noise)
        # Discriminator: real batch labeled 1 (valid), generated labeled 0 (fake).
        d_loss_real = self.discriminator.train_on_batch(attacks, self.valid)
        d_loss_fake = self.discriminator.train_on_batch(gen_attacks, self.fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        # Generator update: try to make the discriminator label fakes as valid.
        g_loss = self.gan.train_on_batch(noise, self.valid)
        # BUG FIX: prev_g_loss and loss_increase_count were never updated, so
        # the report always showed "Loss change" relative to 0 and a count of 0.
        if g_loss > prev_g_loss:
            loss_increase_count += 1
        if epoch % 500 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]"
                  % (epoch, d_loss[0], 100 * d_loss[1], g_loss,
                     g_loss - prev_g_loss, loss_increase_count))
        prev_g_loss = g_loss
def train(self): """ Trains the GAN system """ # break condition for training (when diverging) loss_increase_count = 0 prev_g_loss = 0 conn = SQLConnector() idx = np.arange(self.batch_size) ones = np.ones((self.batch_size, 1)) zeros = np.zeros((self.batch_size, 1)) for epoch in range(50000): # print('Epoch ({}/{})-------------------------------------------------'.format(epoch, self.max_epochs)) # selecting batch_size random attacks from our training data # idx = np.random.randint(0, X_train.shape[0], batch_size) attacks = self.X_train[idx] # generate a matrix of noise vectors noise = np.random.normal(0, 1, (self.batch_size, 41)) # create an array of generated attacks gen_attacks = self.generator.predict(attacks) # loss functions, based on what metrics we specify at model compile time d_loss_real = self.discriminator.train_on_batch( attacks, self.valid) d_loss_fake = self.discriminator.train_on_batch( gen_attacks, self.fake) d_loss = 0.5 * np.add(d_loss_real, d_loss_fake) # generator loss function g_loss = self.gan.train_on_batch(attacks, [gen_attacks, ones]) g_loss = self.gan.train_on_batch(attacks, [gen_attacks, ones]) if epoch % 499 == 0: print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss[0])) print('Real attack:') print(attacks[150]) print('Reconstructed attack:') print(gen_attacks[150].round(3))
def main(argv):
    """Pull attack samples from the database, compute two correlation
    matrices (Pearson via pandas, plus a correlation-ratio matrix over all
    feature pairs), and render them as heatmaps.

    Command-line flags select show-vs-save mode, the output directory, the
    sample count, and the database host.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", "-m", type=str, dest="mode", required=False, default="show",
                        help="Whether to show or save the heatmap. Use -m show or -m save.")
    parser.add_argument("--save_dir", "-dir", type=str, dest="save_dir", required=False, default="figs",
                        help="Directory to save heatmap figures to, if any. Will be created if does not exist.")
    parser.add_argument("--num", "-n", type=str, dest="num", required=False, default=40000,
                        help="Number of samples to pull from the database.")
    parser.add_argument("--host", "-ht", type=str, dest="host", required=False, default="localhost",
                        help="Database host.")
    args = parser.parse_args()

    mode = args.mode
    save_dir = args.save_dir
    # BUG FIX: a malformed --num used to print the error and fall through with
    # `num` unbound (NameError below); fall back to the documented default.
    try:
        num = int(args.num)
    except (TypeError, ValueError) as e:
        print(e)
        num = 40000
    host = args.host

    conn = SQLConnector(host=host)
    data = conn.pull_all_attacks(num, nodupes=True)
    columns = conn.pull_kdd99_columns()
    col_len = len(columns) - 1  # drop the label column
    dataframe = pd.DataFrame(data=data, columns=columns)
    # Using Tim's method: keep only the feature columns.
    dataframe = dataframe.iloc[:, :col_len]
    print(type(columns))

    # ==========
    #  ENCODING
    # ==========
    # One LabelEncoder per column, keyed by column name:
    # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
    d = defaultdict(LabelEncoder)
    fit = dataframe.apply(lambda x: d[x.name].fit_transform(x))  # encoded dataframe
    dataset = fit.values  # transform to ndarray
    print(dataset)
    print(dataset.size)

    # Pearson correlation via pandas, rendered first.
    corr_matrix = fit.corr()
    correlation_heatmap(corr_matrix)

    # Correlation-ratio matrix over all feature pairs.
    # BUG FIX: the row loop previously started at 1, leaving row 0 of the
    # matrix all zeros while the column loop started at 0.
    correlation_matrix = np.zeros(shape=(col_len, col_len))
    for i in range(0, len(columns) - 1):
        for j in range(0, len(columns) - 1):
            correlation_matrix[i, j] = correlation_ratio(dataset[:, i], dataset[:, j])
    print(type(columns))

    correlation_dataframe = pd.DataFrame(data=correlation_matrix,
                                         index=columns[:col_len],
                                         columns=columns[:col_len])
    print(correlation_dataframe)
    print(correlation_matrix.shape)
    correlation_heatmap(correlation_dataframe, mode=mode, save_dir=save_dir, num=num)
def signal_handler(sig, frame):
    """ Catches Crl-C command to print from database before ending """
    # NOTE(review): unlike the sibling SIGINT handlers in this project, this
    # one only opens a database connection — it never prints anything and
    # never calls sys.exit(), so Ctrl-C is effectively swallowed here.
    # Confirm whether this stub was left unfinished.
    conn = SQLConnector()
def train(self):
    """Train the GAN for ``self.max_epochs`` epochs, then record the run's
    hyperparameters (layer sizes, attack type, discriminator accuracy) in
    the database.
    """
    # Divergence bookkeeping for the periodic progress report.
    loss_increase_count = 0
    prev_g_loss = 0
    conn = SQLConnector()
    # Fixed batch: the first batch_size rows of the training data.
    idx = np.arange(self.batch_size)
    for epoch in range(self.max_epochs):
        # selecting batch_size random attacks from our training data
        # idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = self.X_train[idx]
        # One 41-dim noise vector per sample.
        noise = np.random.normal(0, 1, (self.batch_size, 41))
        gen_attacks = self.generator.predict(noise)
        # Discriminator: real batch labeled 1 (valid), generated labeled 0 (fake).
        d_loss_real = self.discriminator.train_on_batch(attacks, self.valid)
        d_loss_fake = self.discriminator.train_on_batch(gen_attacks, self.fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        # Generator update via the combined model.
        g_loss = self.gan.train_on_batch(noise, self.valid)
        if epoch % 500 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]"
                  % (epoch, d_loss[0], 100 * d_loss[1], g_loss,
                     g_loss - prev_g_loss, loss_increase_count))

    # FIX: removed an unresolved merge conflict (<<<<<<< HEAD / ======= markers)
    # and the surrounding commented-out decode/write block that left this
    # function syntactically broken; the surviving branch's logic is kept below.
    # NOTE(review): the original indentation was lost — this write-out is placed
    # after the training loop (one summary row per run); confirm it should not
    # instead run inside the `epoch % 500` report block.
    accuracy = (d_loss[1] * 100)
    layersstr = str(self.generator_layers[0]) + "," + str(self.generator_layers[1]) + "," + str(
        self.generator_layers[2])
    attack_num = util.attacks_to_num(self.attack_type)
    conn.write_hypers(layerstr=layersstr, attack_encoded=attack_num, accuracy=accuracy)
    # TODO: Get the evaluation model implemented and replace the accuracy parameter with that metric
    # TODO: Log our generated attacks to the gens table
    # TODO: Refactor our sql methods with the new database structure
    # TODO: Add foreign key for attack type in hypers table
def main():
    """Compare classifiers trained on the raw ("unbalanced") dataset against
    the same data augmented with GAN-generated attacks ("balanced")."""
    print()
    conn = SQLConnector()
    # Pull real attacks and label-encode the 41 feature columns.
    data = conn.pull_all_attacks(num=10000)
    dataframe = pd.DataFrame.from_records(
        data=data, columns=conn.pull_kdd99_columns(allQ=True))
    d = defaultdict(LabelEncoder)
    features = dataframe.iloc[:, :41]
    attack_labels = dataframe.iloc[:, 41:]
    # Labels use the project-wide numeric encoding, not a LabelEncoder.
    for i in range(0, attack_labels.size):
        attack_labels.at[i, 'attack_type'] = util.attacks_to_num(
            attack_labels.at[i, 'attack_type'])
    fit = features.apply(lambda x: d[x.name].fit_transform(x))
    unbalanced_df = fit.join(attack_labels)
    balanced_df = unbalanced_df.copy(deep=True)

    # Augment the balanced copy with stored generated attacks whose evaluator
    # accuracy was at least 0.90 (up to 1000 rows).
    gen_data = np.asarray(conn.read_gen_attacks_acc_thresh(.90, 1000))
    gen_df = pd.DataFrame.from_records(
        gen_data, columns=conn.pull_kdd99_columns(allQ=True))
    gen_df = gen_df.fillna(0)
    balanced_df = pd.concat([balanced_df, gen_df])
    print(len(balanced_df))

    unbalanced_array = unbalanced_df.values
    balanced_array = balanced_df.values

    # BEGIN LOOP
    # Create two identical multi-class classifiers, make sure their output
    # dimensions match the number of classes in our data
    layers = [16, 32, 16]
    alpha = 0.1
    dropout = 0.3

    unb_labels = unbalanced_array[:, 41]
    [unb_classes, unb_counts] = np.unique(unb_labels, return_counts=True)
    print("Unique classes in unbalanced labels: ")
    print(unb_classes)
    print("Counts for the classes in unbalanced labels: ")
    print(unb_counts)
    unb_class_count = len(unb_classes)
    print("Number of classes in unbalanced dataset: " + str(unb_class_count))

    bal_labels = balanced_array[:, 41]
    [bal_classes, bal_counts] = np.unique(bal_labels, return_counts=True)
    # NOTE(review): dummy_bal_labels is never used; the models below use
    # sparse_categorical_crossentropy (integer labels), so the one-hot version
    # appears redundant — confirm before removing.
    dummy_bal_labels = np_utils.to_categorical(bal_labels)
    bal_class_count = len(bal_classes)
    print("Number of classes in balanced dataset: " + str(bal_class_count))
    print("Unique classes in balanced labels: ")
    print(bal_classes)
    print("Counts for the classes in balanced labels: ")
    print(bal_counts)

    for j in range(100):
        # Fresh, identically-configured classifiers each iteration; output
        # sizes match each dataset's class count.
        unbalanced_classifier = build_discriminator(layers, alpha, dropout,
                                                    unb_class_count)
        balanced_classifier = build_discriminator(layers, alpha, dropout,
                                                  bal_class_count)
        optimizer = Adam(.001)
        unbalanced_classifier.compile(loss='sparse_categorical_crossentropy',
                                      optimizer=optimizer, metrics=['accuracy'])
        balanced_classifier.compile(loss='sparse_categorical_crossentropy',
                                    optimizer=optimizer, metrics=['accuracy'])
        # encoding labels, classifier wants them in range 0 to num_classes
        # NOTE(review): the label columns are re-encoded IN PLACE on every
        # iteration; after iteration 0 they are already 0..k-1, so later passes
        # re-fit on already-encoded values — confirm this is intended.
        unb_enc = LabelEncoder()
        bal_enc = LabelEncoder()
        unb_labels = unbalanced_array[:, 41]
        bal_labels = balanced_array[:, 41]
        unb_enc = unb_enc.fit(unb_labels)
        bal_enc = bal_enc.fit(bal_labels)
        unbalanced_array[:, 41] = unb_enc.transform(unbalanced_array[:, 41])
        balanced_array[:, 41] = bal_enc.transform(balanced_array[:, 41])
        [unb_classes, _] = np.unique(unbalanced_array[:, 41], return_counts=True)

        train_data = unbalanced_array[:, :41].astype(int)
        # NOTE(review): train() presumably returns a confusion matrix that
        # getmetrics() reduces to an accuracy — confirm against its definition.
        unb_cm = train(unbalanced_classifier, unbalanced_array, train_data)
        bal_cm = train(balanced_classifier, balanced_array, train_data)

        print("Metrics for iteration " + str(j))
        # print("Confusion matrix of unbalanced: ")
        # print
        print("Accuracy of unbalanced: " + str(getmetrics(unb_cm)))
        # print("Confusion matrix of balanced: ")
        # print(bal_cm)
        print("Accuracy of balanced" + str(getmetrics(bal_cm)))
        print("Diff: " + str(getmetrics(bal_cm) - getmetrics(unb_cm)))
def setup(self):
    """Load and encode the training data, carve validation/test slices out of
    the evaluator dataset, and initialise the real/fake target vectors."""
    print("Attack type: " + self.attack_type)
    # Pull raw KDD99 rows for this attack type from the database.
    db = SQLConnector()
    records = db.pull_kdd99(attack=self.attack_type, num=5000)
    frame = pd.DataFrame.from_records(
        data=records, columns=db.pull_kdd99_columns(allQ=True))

    # One LabelEncoder per feature column; the label column is mapped with the
    # project-wide util.attacks_to_num so it stays consistent with the evaluator.
    # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
    encoders = defaultdict(LabelEncoder)
    feature_cols = frame.iloc[:, :41]
    label_col = frame.iloc[:, 41:]
    for row in range(label_col.size):
        label_col.at[row, 'attack_type'] = util.attacks_to_num(
            label_col.at[row, 'attack_type'])
    feature_cols = feature_cols.apply(
        lambda col: encoders[col.name].fit_transform(col))

    # Scale features into [-1, 1]; the scaler is kept on self so generated
    # samples can be inverse-transformed later.
    self.scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_df = pd.DataFrame(
        data=self.scaler.fit_transform(feature_cols.astype(float)))

    # Rejoin the separately-encoded features and labels, then drop to ndarray.
    dataset = scaled_df.join(label_col).values
    print(dataset)

    # Evaluation data comes from a pre-built CSV; take 5% for validation and
    # then 5% of the remainder for test, both from the front.
    eval_rows = pd.read_csv('PortsweepAndNonportsweep.csv', header=None).values
    self.eval_dataset_X = eval_rows[:, 0:41].astype(int)
    self.eval_dataset_Y = eval_rows[:, 41]

    validationToTrainRatio = 0.05
    validationSize = int(validationToTrainRatio * len(self.eval_dataset_X))
    self.eval_validation_data = self.eval_dataset_X[:validationSize]
    self.eval_validation_labels = self.eval_dataset_Y[:validationSize]
    self.eval_dataset_X = self.eval_dataset_X[validationSize:]
    self.eval_dataset_Y = self.eval_dataset_Y[validationSize:]

    testToTrainRatio = 0.05
    testSize = int(testToTrainRatio * len(self.eval_dataset_X))
    self.eval_test_data = self.eval_dataset_X[:testSize]
    self.eval_test_labels = self.eval_dataset_Y[:testSize]
    self.eval_dataset_X = self.eval_dataset_X[testSize:]
    self.eval_dataset_Y = self.eval_dataset_Y[testSize:]

    # Visual sanity check of the encoded training data.
    print("Real encoded " + self.attack_type + " attacks:")
    print(dataset[:1])

    # Columns 0..40 are features; column 41 is the attack label.
    self.X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]

    # Ground-truth targets: 1 for real attacks, 0 for generated (fake) ones.
    self.valid = np.ones((self.batch_size, 1))
    self.fake = np.zeros((self.batch_size, 1))
def signal_handler(sig, frame):
    """SIGINT handler: flush results to the database, then terminate."""
    db = SQLConnector()
    writeOut(db)
    sys.exit(0)


print("did it work?")
def train(self):
    """ Trains the GAN system """
    # break condition for training (when diverging)
    # NOTE(review): prev_g_loss and loss_increase_count are printed but never
    # updated, so "Loss change" is always relative to 0.
    loss_increase_count = 0
    prev_g_loss = 0
    conn = SQLConnector()
    # Fixed batch: the first batch_size rows of the training data.
    idx = np.arange(self.batch_size)
    for epoch in range(self.max_epochs):
        #selecting batch_size random attacks from our training data
        #idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = self.X_train[idx]
        # generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (self.batch_size, 41))
        # create an array of generated attacks
        gen_attacks = self.generator.predict(noise)
        # loss functions, based on what metrics we specify at model compile time
        c_loss_real = self.critic.train_on_batch(attacks, self.valid)
        c_loss_fake = self.critic.train_on_batch(gen_attacks, self.fake)
        d_loss = 0.5 * np.add(c_loss_real, c_loss_fake)
        # Clip every critic weight into [-clip_value, clip_value] after each
        # critic update (WGAN-style weight clipping).
        for l in self.critic.layers:
            weights = l.get_weights()
            weights = [
                np.clip(w, -self.clip_value, self.clip_value)
                for w in weights
            ]
            l.set_weights(weights)
        # generator loss function
        g_loss = self.gan.train_on_batch(noise, self.valid)
        if epoch % 500 == 0:
            print(
                "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]"
                % (epoch, d_loss[0], 100 * d_loss[1], g_loss,
                   g_loss - prev_g_loss, loss_increase_count))
            # Undo feature scaling so the evaluator sees raw-scale attacks.
            gen_attacks = self.scaler.inverse_transform(gen_attacks)
            predicted_gen_attack_labels = self.evaluator.predict(
                gen_attacks).transpose().astype(int)
            # All generated rows are, by construction, attacks (label 1).
            gen_attack_labels = np.full(predicted_gen_attack_labels.shape, 1)
            print("Generated attack labels: ")
            print(gen_attack_labels)
            print("Predicted labels of generated attacks: ")
            print(predicted_gen_attack_labels)
            # Fraction of generated rows the evaluator classifies as attacks.
            right = (predicted_gen_attack_labels == 1).sum()
            wrong = (predicted_gen_attack_labels != 1).sum()
            accuracy = (right / float(right + wrong))
            print("5 generated attacks: ")
            print(gen_attacks[:5, :])
            print()
            print("Accuracy of evaluator on generated data: %.4f " % accuracy)
            # Persist promising batches plus this run's hyperparameters.
            if accuracy > .50:
                conn.write_gens(gen_attacks,
                                util.attacks_to_num(self.attack_type))
                layersstr = str(self.generator_layers[0]) + "," + str(
                    self.generator_layers[1]) + "," + str(self.generator_layers[2])
                attack_num = util.attacks_to_num(self.attack_type)
                conn.write_hypers(layerstr=layersstr,
                                  attack_encoded=attack_num,
                                  accuracy=accuracy)
def setup(self):
    """ Setups the GAN """
    # TODO new method called from init opt passed
    print("Attack type: " + self.attack_type)
    conn = SQLConnector()
    data = conn.pull_kdd99(attack=self.attack_type, num=5000)
    dataframe = pd.DataFrame.from_records(data=data,
                                          columns=conn.pull_kdd99_columns(allQ=True))
    # ==========
    # ENCODING
    # ==========
    # One LabelEncoder per feature column, keyed by column name:
    # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
    d = defaultdict(LabelEncoder)
    features = dataframe.iloc[:, :41]
    attack_labels = dataframe.iloc[:, 41:]
    # Labels use the project-wide numeric encoding (consistent with the evaluator).
    for i in range(0, attack_labels.size):
        attack_labels.at[i, 'attack_type'] = util.attacks_to_num(
            attack_labels.at[i, 'attack_type'])
    fit = features.apply(lambda x: d[x.name].fit_transform(x))  # fit is encoded dataframe
    # Rejoin the separately-encoded sections, then drop to ndarray.
    dataframe = fit.join(attack_labels)
    dataset = dataframe.values  # transform to ndarray

    #TODO: Move this entire process outside of gan.py? creating the evaluation model may take time and doesn't need to be redone for every GAN model Moving this
    #TODO: and then handling evaluation and database uploading to another script (like in the automation script) may be more efficient
    #pulling and encoding data for evaluation model
    '''
    eval_data = np.asarray(conn.pull_evaluator_data(1000000, self.attack_type))
    eval_dataframe = pd.DataFrame.from_records(data=eval_data, columns=conn.pull_kdd99_columns(allQ=True))
    encoded_eval_df = eval_dataframe.apply(lambda x: d[x.name].fit_transform(x))
    '''
    # Evaluation data comes from a pre-built CSV; carve validation then test
    # slices (5% each) off the front.
    eval_dataset = pd.read_csv('PortsweepAndNonportsweep.csv', header=None)
    eval_dataset = eval_dataset.values
    self.eval_dataset_X = eval_dataset[:, 0:41].astype(int)
    self.eval_dataset_Y = eval_dataset[:, 41]

    validationToTrainRatio = 0.05
    validationSize = int(validationToTrainRatio * len(self.eval_dataset_X))
    self.eval_validation_data = self.eval_dataset_X[:validationSize]
    self.eval_validation_labels = self.eval_dataset_Y[:validationSize]
    self.eval_dataset_X = self.eval_dataset_X[validationSize:]
    self.eval_dataset_Y = self.eval_dataset_Y[validationSize:]

    testToTrainRatio = 0.05
    testSize = int(testToTrainRatio * len(self.eval_dataset_X))
    self.eval_test_data = self.eval_dataset_X[:testSize]
    self.eval_test_labels = self.eval_dataset_Y[:testSize]
    self.eval_dataset_X = self.eval_dataset_X[testSize:]
    self.eval_dataset_Y = self.eval_dataset_Y[testSize:]

    #print(fit)
    # ==========
    # DECODING
    # ==========
    # print("===============================================")
    # print("decoded:")
    # print("===============================================")
    # decode_test = dataset[:5]  # take a slice from the ndarray that we want to decode
    # decode_test_df = pd.DataFrame(decode_test, columns=conn.pull_kdd99_columns())  # turn that ndarray into a dataframe with correct column names and order
    # decoded = decode_test_df.apply(lambda x: d[x.name].inverse_transform(x))  # decode that dataframe
    # print(decoded)

    # to visually judge encoded dataset
    print("Real encoded " + self.attack_type + " attacks:")
    print(dataset[:1])

    # Set X as our input data and Y as our label
    self.X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]

    # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    self.valid = np.ones((self.batch_size, 1))
    self.fake = np.zeros((self.batch_size, 1))