Example #1
def main():
    # TODO Handle options for input/output
    # TODO Add flags to determine what gets read
    sw = Timer()
    # (label, flatten function, input JSON, output CSV) for each dataset
    datasets = [
        ('business', flatten_business_data,
         'data/yelp_academic_dataset_business.json', 'processed_data/business_data.csv'),
        ('check in', flatten_checkin_data,
         'data/yelp_academic_dataset_checkin.json', 'processed_data/checkin_data.csv'),
        ('tip', flatten_tip_data,
         'data/yelp_academic_dataset_tip.json', 'processed_data/tip_data.csv'),
        ('review', flatten_review_data,
         'data/yelp_academic_dataset_review.json', 'processed_data/review_data.csv'),
    ]
    for label, flatten, json_path, csv_path in datasets:
        print("Flattening %s data" % label)
        sw.start()
        data = flatten(json_path)
        sw.stop()
        print("Time: %f" % sw.elapsed)
        sw.reset()
        print("Outputting %s data" % label)
        sw.start()
        data.to_csv(csv_path, index=False)
        sw.stop()
        print("Time: %f" % sw.elapsed)
        sw.reset()
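Neither this example nor the GAN examples below define the Timer they rely on. A minimal sketch consistent with the calls made here (start, stop, elapsed, reset) and further down (get_time_hhmmss, get_total_time) could look like the following; only the interface is taken from the examples, the implementation is an assumption.

import time

class Timer:
    """Hypothetical stopwatch matching the calls used in these examples."""
    def __init__(self):
        self.elapsed = 0.0   # seconds between the last start()/stop() pair
        self._start = None
        self._total = 0.0    # seconds accumulated over the object's lifetime

    def start(self):
        self._start = time.time()

    def stop(self):
        self.elapsed = time.time() - self._start
        self._total += self.elapsed

    def reset(self):
        self.elapsed = 0.0

    def get_time_hhmmss(self, batches_left):
        # time taken by the current batch and a remaining-time estimate,
        # both formatted as hh:mm:ss
        taken = time.time() - self._start
        self._total += taken
        fmt = lambda s: time.strftime('%H:%M:%S', time.gmtime(s))
        return fmt(taken * batches_left), fmt(taken)

    def get_total_time(self):
        return time.strftime('%H:%M:%S', time.gmtime(self._total))

In a script, main() would normally be invoked under an if __name__ == '__main__': guard.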
Example #2
    def train_GAN(self, X_train, epochs, batch_size, batch_shape, name, gan_summary=False, tensorboard=True):
        """
        

        Parameters
        ----------
        X_train : TYPE
            DESCRIPTION.
        epochs : TYPE
            DESCRIPTION.
        batch_size : TYPE
            DESCRIPTION.
        batch_shape : TYPE
            DESCRIPTION.
        name : TYPE
            DESCRIPTION.
        gan_summary : TYPE, optional
            DESCRIPTION. The default is False.

        Returns
        -------
        TYPE
            DESCRIPTION.

        """
        self.input_shape = (batch_shape[1], batch_shape[2])
        generator, discriminator, gan_model = self.get_gan_model(name)
        if tensorboard:
            self.tensorboard_callback(models=[generator, discriminator, gan_model])
        if gan_summary:
            gan_model.summary()
            print('Note: in the combined GAN model the discriminator parameters '
                  'are set to non-trainable, because the discriminator must stay '
                  'frozen while the generator is being trained.')
        steps_per_epoch = len(X_train)//batch_size
        chk = input('\n\nStart training y/N: ')
        if chk.lower()=='y':
            for epoch in range(1, epochs+1):
                # set up the per-batch timer
                timer = Timer()
                bg = BatchGenerator(X_train, batch_size=batch_size)
                for batch in range(1, steps_per_epoch + 1):
                    # start the timer
                    timer.start()
                    # X holds the batch in its original dimensions [batches, features];
                    # X_reshaped is the same data reshaped to [batches, timesteps, features],
                    # since the LSTM layer expects 3D input; x_t1 is the data at time t+1,
                    # i.e. the next batch.
                    X, X_reshaped, x_t1 = bg.get_nextBatch(batch_shape)
                    # get the data for discriminator training
                    X_disc, Y_disc, X_fake = bg.get_disc_gan_data(generator, X, X_reshaped, x_t1)
                    # train the discriminator on real and generated samples
                    metrics = discriminator.train_on_batch(X_disc, Y_disc)
                    self.history_batch['Disc_Loss'].append(metrics[0])
                    self.history_batch['Disc_Acc'].append(metrics[1])
                    # train the generator through the combined GAN model
                    self.train_generator(generator, gan_model, X_reshaped, x_t1, X_fake)
                    # total time taken by the batch and estimated time remaining
                    self.time_remain, self.time_taken = timer.get_time_hhmmss(steps_per_epoch-batch)
                    self.info_out('batch', epoch, epochs, batch, steps_per_epoch)
                    
                # compute loss & accuracy over the epoch just finished; history_batch
                # accumulates across epochs, so only its last steps_per_epoch entries
                # belong to this epoch
                self.history_epoch['Disc_Loss'].append(sum(self.history_batch['Disc_Loss'][-steps_per_epoch:])/steps_per_epoch)
                self.history_epoch['Disc_Acc'].append(sum(self.history_batch['Disc_Acc'][-steps_per_epoch:])/steps_per_epoch)
                self.history_epoch['Gen_Loss'].append(sum(self.history_batch['Gen_Loss'][-steps_per_epoch:])/steps_per_epoch)
                self.history_epoch['Gen_Acc'].append(sum(self.history_batch['Gen_Acc'][-steps_per_epoch:])/steps_per_epoch)
                self.history_epoch['Batch_Data'].append(self.history_batch)
                self.info_out(which='epoch', epoch=epoch, total_time=timer.get_total_time())
                self.ckpt_callback(epoch, [generator, discriminator, gan_model])
        elif chk.lower()=='n':
            raise SystemExit
        return self.history_epoch
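The BatchGenerator used above (get_nextBatch, get_disc_gan_data) is not shown. A minimal sketch of get_disc_gan_data, assuming the usual GAN labeling scheme (real next-step samples labeled 1, generator output labeled 0), might look like this; only the name and signature come from the example, the body is an assumption.

import numpy as np

class BatchGenerator:
    # ...other methods omitted...
    def get_disc_gan_data(self, generator, X, X_reshaped, x_t1):
        # generated (fake) samples: the generator predicts the next step
        # from the reshaped 3D batch
        X_fake = generator.predict(X_reshaped)
        # one discriminator batch: real next-step data followed by the fakes
        X_disc = np.concatenate((x_t1, X_fake))
        # labels: 1 for real samples, 0 for generated samples
        Y_disc = np.concatenate((np.ones(len(x_t1)), np.zeros(len(X_fake))))
        return X_disc, Y_disc, X_fake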
Example #3
def train_gan(X, path, epochs=2, batch_size=64):
    if path[-1] != '/':
        path += '/'
    os.makedirs(path + 'checkpoints', exist_ok=True)

    X_benignware = X[0]
    X_malware = X[1]

    #Creating Models
    input_shape = X_malware.shape[1]
    opt = Adam(learning_rate=0.001)
    generator, discriminator, gan = create_gan(input_shape,
                                               path,
                                               opt,
                                               name='GAN')

    #creating callback for saving checkpoints
    cb = Callbacks(ckpt_path=path + 'checkpoints',
                   models=[generator, discriminator, gan])

    steps_per_epoch = X_malware.shape[0] // batch_size
    history_epochs = {
        'Disc_Loss': [],
        'Disc_Acc': [],
        'Gen_Loss': [],
        'Gen_Acc': [],
        'Batch_Data': []
    }

    chk = input('\n\nStart training GANs (y/N): ')
    if chk.lower() == 'y':
        for epoch in range(1, epochs + 1):
            timer = Timer()
            history_batch = {
                'Disc_Loss': [],
                'Disc_Acc': [],
                'Gen_Loss': [],
                'Gen_Acc': [],
                'Batch_Data': []
            }
            bg = BatchGenerator(X_malware, batch_size, batch_shape=None)
            for batch in range(1, steps_per_epoch + 1):
                # start the timer
                timer.start()
                #Getting next batch
                batch_malware = bg.get_nextBatch()
                #generating features from Generator
                gen_features = generator.predict(batch_malware)
                #getting samples from benignware for concatenation
                batch_benignware = X_benignware[np.random.randint(
                    0, X_benignware.shape[0], size=batch_size)]
                #converting sparse to dense
                batch_benignware = batch_benignware.todense()

                # Real (benign) and generated samples are fed to the discriminator
                # as two separate batches rather than one concatenated batch.
                # Labels for the generated features: 0 (fake)
                disc_y = np.zeros(batch_size)
                disc_metric0 = discriminator.train_on_batch(
                    gen_features, disc_y)
                # Labels for the benignware samples: 1 (real)
                disc_y = np.ones(batch_size)
                disc_metric1 = discriminator.train_on_batch(
                    batch_benignware, disc_y)
                # Average loss and accuracy over the two half-batches
                disc_metric = [(disc_metric0[0] + disc_metric1[0]) / 2,
                               (disc_metric0[1] + disc_metric1[1]) / 2]

                history_batch['Disc_Loss'].append(disc_metric[0])
                history_batch['Disc_Acc'].append(disc_metric[1])

                # Train the generator through the combined GAN model: generated
                # samples are labeled 1 (real) so the generator is pushed to fool
                # the discriminator. Five generator updates are run per
                # discriminator update; only the last metrics are kept.
                y_gen = np.ones(batch_size)
                for _ in range(5):
                    gen_metric = gan.train_on_batch(batch_malware, y_gen)
                history_batch['Gen_Loss'].append(gen_metric[0])
                history_batch['Gen_Acc'].append(gen_metric[1])

                # batch timing and progress info
                time_remain, time_taken = timer.get_time_hhmmss(
                    steps_per_epoch - batch)
                timers = (time_remain, time_taken)
                history = (history_batch, history_epochs)
                info_out('batch', history, timers, epoch, epochs, batch,
                         steps_per_epoch)

            #computing loss & accuracy over one epoch
            history_epochs['Disc_Loss'].append(
                sum(history_batch['Disc_Loss']) / steps_per_epoch)
            history_epochs['Disc_Acc'].append(
                sum(history_batch['Disc_Acc']) / steps_per_epoch)
            history_epochs['Gen_Loss'].append(
                sum(history_batch['Gen_Loss']) / steps_per_epoch)
            history_epochs['Gen_Acc'].append(
                sum(history_batch['Gen_Acc']) / steps_per_epoch)
            history_epochs['Batch_Data'].append(history_batch)

            history = (history_batch, history_epochs)
            info_out(which='epoch',
                     history=history,
                     epoch=epoch,
                     total_time=timer.get_total_time())
            cb.ckpt_callback(epoch, history_epochs)
    elif chk.lower() == 'n':
        raise SystemExit
    #Writing history to disk
    dump(history_epochs, open(path + 'checkpoints/history.obj', 'wb'))
    return history_epochs
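A hypothetical invocation of train_gan: the loop above calls .todense() on benignware batches, so X_benignware is assumed to be a scipy sparse matrix, while X_malware is taken to be a dense array. The file names and paths below are placeholders, and dump/load are assumed to be pickle's.

import numpy as np
from pickle import load
from scipy.sparse import load_npz

X_benignware = load_npz('features/benignware.npz')  # sparse feature matrix (placeholder path)
X_malware = np.load('features/malware.npy')         # dense feature matrix (placeholder path)

history = train_gan((X_benignware, X_malware), 'runs/malgan', epochs=2, batch_size=64)

# the per-epoch history is also written to disk and can be reloaded later
history = load(open('runs/malgan/checkpoints/history.obj', 'rb'))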