def main():
    # TODO: Handle options for input/output
    # TODO: Add flags to determine what gets read

    # (label, flatten function, input JSON, output CSV) for each dataset.
    datasets = [
        ('business', flatten_business_data,
         'data/yelp_academic_dataset_business.json',
         'processed_data/business_data.csv'),
        ('check in', flatten_checkin_data,
         'data/yelp_academic_dataset_checkin.json',
         'processed_data/checkin_data.csv'),
        ('tip', flatten_tip_data,
         'data/yelp_academic_dataset_tip.json',
         'processed_data/tip_data.csv'),
        ('review', flatten_review_data,
         'data/yelp_academic_dataset_review.json',
         'processed_data/review_data.csv'),
    ]

    sw = Timer()
    for label, flatten, json_path, csv_path in datasets:
        # Flatten the raw JSON into tabular form, timing the step.
        print("Flattening %s data" % label)
        sw.start()
        data = flatten(json_path)
        sw.stop()
        print("Time: %f" % sw.elapsed)
        sw.reset()

        # Write the flattened data to CSV, timing the step.
        print("Outputting %s data" % label)
        sw.start()
        data.to_csv(csv_path, index=False)
        sw.stop()
        print("Time: %f" % sw.elapsed)
        sw.reset()
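# A minimal entry-point guard so the script can be run directly; it assumes
# the flatten_* helpers and Timer are defined in (or imported into) this
# module, as the function above implies.
if __name__ == '__main__':
    main()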
def train_GAN(self, X_train, epochs, batch_size, batch_shape, name,
              gan_summary=False, tensorboard=True):
    """Train the generator and discriminator in alternating steps.

    Parameters
    ----------
    X_train : array-like
        Training data.
    epochs : int
        Number of training epochs.
    batch_size : int
        Number of samples per batch.
    batch_shape : tuple
        Shape of one batch as (batches, timesteps, features).
    name : str
        Name used when building the GAN model.
    gan_summary : bool, optional
        If True, print a summary of the combined model. The default is False.
    tensorboard : bool, optional
        If True, register a TensorBoard callback. The default is True.

    Returns
    -------
    dict
        Per-epoch training history.
    """
    self.input_shape = (batch_shape[1], batch_shape[2])
    generator, discriminator, gan_model = self.get_gan_model(name)

    if tensorboard:
        self.tensorboard_callback(models=[generator, discriminator, gan_model])

    if gan_summary:
        gan_model.summary()
        print('Note: in the combined GAN model the discriminator parameters '
              'are set to non-trainable, because the discriminator must not '
              'be updated while the generator is being trained.')

    steps_per_epoch = len(X_train) // batch_size
    chk = input('\n\nStart training y/N: ')
    if chk.lower() == 'y':
        for epoch in range(1, epochs + 1):
            # Set up the timer for this epoch.
            time = Timer()
            bg = BatchGenerator(X_train, batch_size=batch_size)
            for batch in range(1, steps_per_epoch + 1):
                # Start the timer for this batch.
                time.start()
                # X holds the data in its original dimensions
                # [batches, features]; X_reshaped is the same data reshaped
                # to [batches, timesteps, features] because the LSTM layer
                # expects 3D input; x_t1 is the data at time t+1 (the next
                # batch).
                X, X_reshaped, x_t1 = bg.get_nextBatch(batch_shape)
                # Get the data for discriminator training.
                X_disc, Y_disc, X_fake = bg.get_disc_gan_data(
                    generator, X, X_reshaped, x_t1)

                # Train the discriminator.
                metrics = discriminator.train_on_batch(X_disc, Y_disc)
                self.history_batch['Disc_Loss'].append(metrics[0])
                self.history_batch['Disc_Acc'].append(metrics[1])

                # Train the generator.
                self.train_generator(generator, gan_model, X_reshaped,
                                     x_t1, X_fake)

                # Record the total time taken by this batch.
                self.time_remain, self.time_taken = time.get_time_hhmmss(
                    steps_per_epoch - batch)
                self.info_out('batch', epoch, epochs, batch, steps_per_epoch)

            # Compute average loss and accuracy over the epoch.
            self.history_epoch['Disc_Loss'].append(
                sum(self.history_batch['Disc_Loss']) / steps_per_epoch)
            self.history_epoch['Disc_Acc'].append(
                sum(self.history_batch['Disc_Acc']) / steps_per_epoch)
            self.history_epoch['Gen_Loss'].append(
                sum(self.history_batch['Gen_Loss']) / steps_per_epoch)
            self.history_epoch['Gen_Acc'].append(
                sum(self.history_batch['Gen_Acc']) / steps_per_epoch)
            self.history_epoch['Batch_Data'].append(self.history_batch)
            self.info_out(which='epoch', epoch=epoch,
                          total_time=time.get_total_time())
            self.ckpt_callback(epoch, [generator, discriminator, gan_model])
    elif chk.lower() == 'n':
        raise SystemExit

    return self.history_epoch
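# A hedged usage sketch for train_GAN, kept as a comment because the
# enclosing class is not shown in this file: `GanTrainer` is a hypothetical
# name for that class, and the batch shape assumes X_train is already
# windowed into [samples, timesteps, features].
#
#     trainer = GanTrainer()
#     history = trainer.train_GAN(X_train, epochs=10, batch_size=64,
#                                 batch_shape=(64,) + X_train.shape[1:],
#                                 name='lstm_gan')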
def train_gan(X, path, epochs=2, batch_size=64):
    if path[-1] != '/':
        path += '/'
    os.makedirs(path + 'checkpoints', exist_ok=True)

    X_benignware = X[0]
    X_malware = X[1]

    # Create the models.
    input_shape = X_malware.shape[1]
    opt = Adam(learning_rate=0.001)
    generator, discriminator, gan = create_gan(input_shape, path, opt,
                                               name='GAN')

    # Create the callback for saving checkpoints.
    cb = Callbacks(ckpt_path=path + 'checkpoints',
                   models=[generator, discriminator, gan])

    steps_per_epoch = X_malware.shape[0] // batch_size
    history_epochs = {
        'Disc_Loss': [],
        'Disc_Acc': [],
        'Gen_Loss': [],
        'Gen_Acc': [],
        'Batch_Data': []
    }

    chk = input('\n\nStart training GANs (y/N): ')
    if chk.lower() == 'y':
        for epoch in range(1, epochs + 1):
            time = Timer()
            history_batch = {
                'Disc_Loss': [],
                'Disc_Acc': [],
                'Gen_Loss': [],
                'Gen_Acc': [],
                'Batch_Data': []
            }
            bg = BatchGenerator(X_malware, batch_size, batch_shape=None)
            for batch in range(1, steps_per_epoch + 1):
                # Start the timer for this batch.
                time.start()

                # Get the next batch of malware samples.
                batch_malware = bg.get_nextBatch()

                # Generate features from the generator.
                gen_features = generator.predict(batch_malware)

                # Sample benignware for this batch and convert the sparse
                # rows to dense.
                batch_benignware = X_benignware[np.random.randint(
                    0, X_benignware.shape[0], size=batch_size)]
                batch_benignware = batch_benignware.todense()

                # Train the discriminator on generated (label 0) and benign
                # (label 1) samples as two separate batches rather than one
                # concatenated batch, then average the two sets of metrics.
                disc_metric0 = discriminator.train_on_batch(
                    gen_features, np.zeros(batch_size))
                disc_metric1 = discriminator.train_on_batch(
                    batch_benignware, np.ones(batch_size))
                disc_metric = [(disc_metric0[0] + disc_metric1[0]) / 2,
                               (disc_metric0[1] + disc_metric1[1]) / 2]
                history_batch['Disc_Loss'].append(disc_metric[0])
                history_batch['Disc_Acc'].append(disc_metric[1])

                # Train the generator through the combined GAN model; it is
                # updated five times per discriminator step so it does not
                # fall behind.
                y_gen = np.ones(batch_size)
                for _ in range(5):
                    gen_metric = gan.train_on_batch(batch_malware, y_gen)
                history_batch['Gen_Loss'].append(gen_metric[0])
                history_batch['Gen_Acc'].append(gen_metric[1])

                # Print per-batch progress.
                time_remain, time_taken = time.get_time_hhmmss(
                    steps_per_epoch - batch)
                timers = (time_remain, time_taken)
                history = (history_batch, history_epochs)
                info_out('batch', history, timers, epoch, epochs, batch,
                         steps_per_epoch)

            # Compute average loss and accuracy over the epoch.
            history_epochs['Disc_Loss'].append(
                sum(history_batch['Disc_Loss']) / steps_per_epoch)
            history_epochs['Disc_Acc'].append(
                sum(history_batch['Disc_Acc']) / steps_per_epoch)
            history_epochs['Gen_Loss'].append(
                sum(history_batch['Gen_Loss']) / steps_per_epoch)
            history_epochs['Gen_Acc'].append(
                sum(history_batch['Gen_Acc']) / steps_per_epoch)
            history_epochs['Batch_Data'].append(history_batch)

            history = (history_batch, history_epochs)
            info_out(which='epoch', history=history, epoch=epoch,
                     total_time=time.get_total_time())
            cb.ckpt_callback(epoch, history_epochs)
    elif chk.lower() == 'n':
        raise SystemExit

    # Write the training history to disk.
    dump(history_epochs, open(path + 'checkpoints/history.obj', 'wb'))
    return history_epochs
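# A minimal, hypothetical driver for train_gan: random matrices stand in for
# the real benignware/malware feature data, and the feature count (128) and
# sample counts are illustrative assumptions only. It still relies on the
# module's own BatchGenerator, create_gan, Callbacks, Timer, and info_out.
if __name__ == '__main__':
    from scipy.sparse import csr_matrix

    rng = np.random.default_rng(0)
    X_benignware = csr_matrix(rng.random((1024, 128), dtype='float32'))
    X_malware = rng.random((1024, 128), dtype='float32')
    train_gan((X_benignware, X_malware), path='runs/demo', epochs=1,
              batch_size=64)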