import datetime

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.models import load_model, save_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical


class VGG19_C:
    def __init__(self):
        self.covid_path = 'dataset/covid_dataset.csv'
        self.covid_image_path = 'dataset/covid_adjusted/'
        self.normal_path = 'dataset/normal_xray_dataset.csv'
        self.normal_image_path = 'dataset/normal_dataset/'
        self.head_count = 99
        self.test_ratio = 0.15
        self.shape = (224, 224, 3)
        self.folds = 5
        self.batch_size = 32
        self.epochs = 500
        self.verbose = 2 
        self.activation_optimizer = Adam(lr=0.0001, decay=1e-6)
        self.early_stop_criteria = EarlyStopping(patience=100, restore_best_weights=True)
        self.prior_model_path = 'prior_model.h5'

    def Generate_Model(self, params = 'default'):
        if params == 'default':
            shape = self.shape
            
        start_generate = datetime.datetime.now()
  
        model = tf.keras.Sequential()
        model.add(Conv2D(64, kernel_size=(3, 3), input_shape=self.shape, activation='relu', padding='same')) 
        model.add(Conv2D(64, kernel_size=(3, 3), input_shape=self.shape, activation='relu', padding='same'))
        
        model_to_transfer = self.load_model(self.prior_model_path)
        self.model_to_transfer = model_to_transfer

        for i, layer in enumerate(model_to_transfer.layers[0].layers[2:]):
            model.add(layer)

        
        print('model_to_transfer-feedforward')
        for i, layer in enumerate(model_to_transfer.layers[1:]):
            model.add(layer)
        
        for i, layer in enumerate(model.layers):
            if i > 1:
                layer.trainable = False

        opt = self.activation_optimizer
        model.compile(
                loss='categorical_crossentropy', 
                optimizer=opt, 
                metrics=['accuracy']
        )

        train_aug = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True
        )
        
        
        self.train_aug = train_aug
        
        end_generate = datetime.datetime.now()
        self.generate_time = str(end_generate - start_generate)

        print('calculation time for Generate_Model: {}'.format(self.generate_time))
        
        return model
    
    def Run_Model(self, params = 'default'):
        if params == 'default':
            folds = self.folds; batch_size = self.batch_size; epochs = self.epochs; 
            X_train = self.X_train; y_train = self.y_train; X_test = self.X_test; y_test = self.y_test
            model = self.model; train_aug = self.train_aug
            
        lst_perf_folds = []
        lst_perf_folds_evaluate = []
        lst_perf_folds_history = []
        lst_perf_folds_report = []
        start_run = datetime.datetime.now()

        kf = KFold(n_splits = folds, random_state = 1, shuffle = True)
        kf.get_n_splits(X_train)

        for fold, (train_index, validation_index) in enumerate(kf.split(X_train)):
            print('\n Fold %d' % (fold))
            start_iter = datetime.datetime.now()
            X_train_fold, X_validation = X_train[train_index], X_train[validation_index]
            y_train_fold, y_validation = y_train[train_index], y_train[validation_index]

            early_stop_criteria = self.early_stop_criteria
            history = model.fit(train_aug.flow(X_train_fold, y_train_fold, batch_size=batch_size),
                                validation_data=(X_validation, y_validation),
                                validation_steps=len(X_validation) // batch_size,
                                steps_per_epoch=len(X_train_fold) // batch_size,
                                epochs=epochs, verbose=self.verbose,
                                callbacks=[
                                early_stop_criteria
                                ]
            )  

            y_pred_test = model.predict(X_test, batch_size = batch_size)
            y_pred_train = model.predict(X_train, batch_size = batch_size)
            y_pred_train_fold = model.predict(X_train_fold, batch_size = batch_size)
            y_pred_validation = model.predict(X_validation, batch_size = batch_size)

            rep1 = classification_report(np.argmax(y_test, axis = 1), np.argmax(y_pred_test, axis = 1), output_dict = True)
            rep2 = classification_report(np.argmax(y_train, axis = 1), np.argmax(y_pred_train, axis = 1), output_dict = True)
            rep3 = classification_report(np.argmax(y_train_fold, axis = 1), np.argmax(y_pred_train_fold, axis = 1), output_dict = True)
            rep4 = classification_report(np.argmax(y_validation, axis = 1), np.argmax(y_pred_validation, axis = 1), output_dict = True)

            lst_perf_folds.append((rep1['accuracy'], rep2['accuracy'], rep3['accuracy'], rep4['accuracy']))
            lst_perf_folds_history.append(history.history)
            lst_perf_folds_report.append((rep1, rep2, rep3, rep4))

            evaluate_TEST = model.evaluate(X_test, y_test, verbose=0)
            evaluate_TRAIN = model.evaluate(X_train, y_train, verbose=0)
            evaluate_TRAIN_Fold = model.evaluate(X_train_fold, y_train_fold, verbose=0)
            evaluate_VALIDATION = model.evaluate(X_validation, y_validation, verbose=0)

            lst_perf_folds_evaluate.append((evaluate_TEST, evaluate_TRAIN, evaluate_TRAIN_Fold, evaluate_VALIDATION))

            end_iter = datetime.datetime.now()
            print('calculation time for iteration-{}: {}'.format(str(fold), str(end_iter - start_iter)))

        mean_Accuracy_TEST = round(np.mean(np.array(lst_perf_folds)[:, 0]), 4)
        self.mean_Accuracy_TEST = mean_Accuracy_TEST

        mean_Accuracy_TRAIN = round(np.mean(np.array(lst_perf_folds)[:, 1]), 4)
        self.mean_Accuracy_TRAIN = mean_Accuracy_TRAIN

        mean_Accuracy_TRAIN_Fold = round(np.mean(np.array(lst_perf_folds)[:, 2]), 4)

        mean_Accuracy_VALIDATION = round(np.mean(np.array(lst_perf_folds)[:, 3]), 4)
        self.mean_Accuracy_VALIDATION = mean_Accuracy_VALIDATION
        print('Avg-TEST Acc: {} ... Avg-VALIDATION Acc: {}'.format(mean_Accuracy_TEST, mean_Accuracy_VALIDATION))
        print('Avg-TRAIN Acc: {} ... Avg-TRAIN_Fold Acc: {}'.format(mean_Accuracy_TRAIN, mean_Accuracy_TRAIN_Fold))
        print('lst_perf_folds_evaluate: {}'.format(lst_perf_folds_evaluate))
        
        print('Support Check:')
        print('TEST: # of Images Normal: {} vs Covid-19: {}'.format(rep1['1']['support'], rep1['0']['support']))
        print('VALIDATION: # of Images Normal: {} vs Covid-19: {}'.format(rep4['1']['support'], rep4['0']['support']))
        print('TRAIN: # of Images Normal: {} vs Covid-19: {}'.format(rep2['1']['support'], rep2['0']['support']))
        print('TRAIN-Fold: # of Images Normal: {} vs Covid-19: {}'.format(rep3['1']['support'], rep3['0']['support']))   
        
        end_run = datetime.datetime.now()
        self.run_time = str(end_run - start_run)

        print('calculation time for RUN: {}'.format(self.run_time))
        print('Finished at {}'.format(datetime.datetime.now()))
        
    def Create_DataFrames(self, params = 'default'):
        if params == 'default':
            covid_path = self.covid_path; normal_path = self.normal_path; head_count = self.head_count
            
        covid_df = pd.read_csv(covid_path, usecols=['filename', 'finding'])
        normal_df = pd.read_csv(normal_path, usecols=['filename', 'finding'])
        normal_df = normal_df.head(head_count)
        
        self.covid_df = covid_df
        self.normal_df = normal_df

        return covid_df, normal_df

    def Fetch_Images(self, params = 'default' ):
        if params == 'default':
            covid_df = self.covid_df; normal_df = self.normal_df
            
        covid_images_lst = []
        covid_labels = []
        
        covid_image_path = self.covid_image_path
        normal_image_path = self.normal_image_path

        for index, row in covid_df.iterrows():
            filename = row['filename']
            label = row['finding']
            path = covid_image_path + filename

            image = cv2.imread(path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            covid_images_lst.append(image)
            covid_labels.append(label)

        normal_images_lst = []
        normal_labels = []

        for index, row in normal_df.iterrows():
            filename = row['filename']
            label = row['finding']
            path = normal_image_path + filename

            # temporary fix while we preprocess ALL the images
            if filename == '4c268764-b5e5-4417-85a3-da52916984d8.jpg':
                break

            image = cv2.imread(path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            normal_images_lst.append(image)
            normal_labels.append(label)

        # normalize to interval of [0, 1] (done once, after both loops, so the arrays
        # are built even if the loop above exits early)
        covid_images = np.array(covid_images_lst) / 255

        # normalize to interval of [0, 1]
        normal_images = np.array(normal_images_lst) / 255

        self.covid_images = covid_images
        self.normal_images = normal_images
        self.covid_labels = covid_labels
        self.normal_labels = normal_labels

        return covid_images, normal_images, covid_labels, normal_labels
    
    def plot_images(self, images, title):  
        nrows, ncols = 10, 10
        figsize = [5, 5]

        fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, facecolor=(1, 1, 1))

        for i, axi in enumerate(ax.flat):
            # skip cells beyond the number of available images
            if i < len(images):
                axi.imshow(images[i])
            axi.set_axis_off()

        plt.suptitle(title, fontsize=24)
        plt.tight_layout(pad=0.2, rect=[0, 0, 1, 0.9])
        plt.show()

    def Split_Train_Test(self, params = 'default'):
        if params == 'default':
            covid_images = self.covid_images; normal_images = self.normal_images 
            covid_labels = self.covid_labels; normal_labels = self.normal_labels; test_ratio = self.test_ratio
            
        # covid_images=92, normal_images=99 >> 191 total, split according to test_ratio
        # split into training and testing   # , shuffle = True
        covid_x_train, covid_x_test, covid_y_train, covid_y_test = \
        train_test_split(covid_images, covid_labels, test_size = test_ratio)

        normal_x_train, normal_x_test, normal_y_train, normal_y_test =\
        train_test_split(normal_images, normal_labels, test_size = test_ratio)

        X_train = np.concatenate((normal_x_train, covid_x_train), axis=0)
        X_test = np.concatenate((normal_x_test, covid_x_test), axis=0)
        y_train = np.concatenate((normal_y_train, covid_y_train), axis=0)
        y_test = np.concatenate((normal_y_test, covid_y_test), axis=0)

        # make labels into categories - either 0 or 1
        y_train = LabelBinarizer().fit_transform(y_train)
        y_train = to_categorical(y_train)

        y_test = LabelBinarizer().fit_transform(y_test)
        y_test = to_categorical(y_test)
        
        self.X_train = X_train; self.y_train = y_train
        self.X_test = X_test; self.y_test = y_test
        
        return X_train, y_train, X_test, y_test
    
    def Time_Stamp(self):
        date_time = datetime.datetime.now()

        D = str(date_time.day)
        M = str(date_time.month)
        Y = str(date_time.year)

        h = str(date_time.hour)
        m = str(date_time.minute)
        s = str(date_time.second)

        lst_date = [D, M, Y, h, m, s]
        
        return lst_date
    
    def FileNameUnique(self, prefix = "Grp16_", suffix = '.csv'):
        file_name = prefix

        lst_date = self.Time_Stamp()
        
        for idx, i in enumerate(lst_date):
            if idx == 2:
                file_name += i + '_'
            elif idx == 5:
                file_name += i + suffix
            else:
                file_name += i + '.'

        return file_name
    
    def model_parameters(self):
        list_param_name = ['test_ratio', 'folds', 'batch_size', 'epochs', 'verbose', 'shape', 
                            'activation_optimizer', 'early_stop_criteria',
                            'covid_path', 'covid_image_path', 'normal_path', 'normal_image_path', 'head_count'] 
        
        list_param_values = [self.test_ratio, self.folds, self.batch_size, self.epochs, self.verbose, self.shape, 
                              self.activation_optimizer, self.early_stop_criteria,
               self.covid_path, self.covid_image_path, self.normal_path, self.normal_image_path, self.head_count]
        
        dict_params = {'parameter': list_param_name, 'value': list_param_values}
        df_params = pd.DataFrame(dict_params)

        return df_params
        
    def model_parameters_save(self):
        list_param_name = ['mean_Accuracy_TEST', 'mean_Accuracy_VALIDATION', 'mean_Accuracy_TRAIN', 'Run_Time',
                            'test_ratio', 'folds', 'batch_size', 'epochs', 'verbose', 'shape', 
                            'activation_opt_keys', 'activation_opt_vals', 'early_stop_criteria',
                            'covid_path', 'covid_image_path', 'normal_path', 'normal_image_path', 'head_count'] 
        
        opt_config = self.activation_optimizer.get_config()
        list_optimizer_keys = list(opt_config.keys())
        list_optimizer_values = list(opt_config.values())

        list_param_values = [self.mean_Accuracy_TEST, self.mean_Accuracy_VALIDATION, self.mean_Accuracy_TRAIN, 
        self.run_time, self.test_ratio, self.folds, self.batch_size, self.epochs, self.verbose, self.shape, 
        list_optimizer_keys, list_optimizer_values, self.early_stop_criteria, self.covid_path, self.covid_image_path, 
        self.normal_path, self.normal_image_path, self.head_count]
        
        dict_params = {'parameter': list_param_name, 'value': list_param_values}
        df_params = pd.DataFrame(dict_params)

        return df_params

    def save_model(self, model_to_save, file_name):
        save_model(model_to_save, file_name)
        print('model saved as: {}'.format(file_name))
 
    def load_model(self, file_name):
        loaded_model = load_model(file_name)
        return loaded_model
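
# A minimal usage sketch (an assumption, not part of the original source): it presumes
# the dataset CSVs / image folders configured in __init__ and 'prior_model.h5' exist.
# The call order is inferred from the method dependencies; note Run_Model reads
# self.model, which Generate_Model returns but does not assign.
if __name__ == "__main__":
    vgg = VGG19_C()
    vgg.Create_DataFrames()
    vgg.Fetch_Images()
    vgg.Split_Train_Test()
    vgg.model = vgg.Generate_Model()
    vgg.Run_Model()
    vgg.model_parameters_save().to_csv(vgg.FileNameUnique(), index=False)
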
Example 2
class VPGSolver(StandardAgent):
    """
    A standard vpg_solver, inspired by:
      https://github.com/jachiam/rl-intro/blob/master/pg_cartpole.py
    NOTE: 
        will need to examine steps (total_t), not episodes, as VPG doesn't
        implement episodes per-training-step
    """
    can_graph = True  # enables tf.function graphing (variable batch sizes may trigger retracing)

    def __init__(self, 
        experiment_name, 
        env_wrapper,
        gamma=0.99, 
        epsilon=None,
        epsilon_decay_rate=0.995,
        epsilon_min=0.1,
        batch_size=64,
        n_cycles=128,
        learning_rate=0.01,
        model_name="vpg", 
        saving=True):

        super(VPGSolver, self).__init__(
            env_wrapper,
            model_name,
            experiment_name,
            saving=saving)

        self.label = "Batch"  # not by episode, by arbitrary batch
        self.action_size_tensor = tf.constant(self.action_size)
        
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon_min = epsilon_min

        # TODO could go to standard..
        self.batch_size = batch_size
        self.n_cycles = n_cycles

        self.memory = []  # state
        self.solved_on = None

        self.model = self.build_model()
        self.optimizer = Adam(lr=learning_rate)  # decay=learning_rate_decay)

        self.load_state()

        # TODO rollout steps

    @staticmethod
    def discount_future_cumsum(episode_rewards, gamma):
        """
        Takes: 
            A list of rewards per step for an episode
        Returns: 
            The future reward at each step, with the future discounting 
            rate applied from that step onwards.
        """
        ep_rwds = np.array(episode_rewards)
        n = len(ep_rwds)
        discounts = gamma ** np.arange(n)
        discounted_futures = np.zeros_like(ep_rwds, dtype=np.float64)
        for j in range(n):
            discounted_futures[j] = sum(ep_rwds[j:] * discounts[:(n-j)])

        assert len(discounted_futures) == len(episode_rewards)
        return discounted_futures
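
    # Worked example (not from the original source): with episode_rewards = [1, 1, 1]
    # and gamma = 0.5, the discounted future returns are
    #   [1 + 0.5 + 0.25, 1 + 0.5, 1] = [1.75, 1.5, 1.0]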

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state, done, episode_rewards = env.reset(), False, []
        success_steps = 0

        for batch_num in range(max_iters):
            # Refresh every batch (on-policy)
            state_batch, act_batch, batch_future_rewards = [], [], []

            for step in range(self.n_cycles):
                if render:
                    env.render()

                action = self.act(self.model, state, epsilon=self.epsilon)
                state_next, reward, done, _ = env.step(action)

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, state_next, reward, done, step)
                
                state_batch.append(state.copy())
                act_batch.append(np.int32(action))
                episode_rewards.append(reward)

                # NOTE: Removed copy
                state = state_next

                self.report_step(step, batch_num, max_iters)

                if done:

                    # At the end of each episode:
                    # Create a list of future rewards, 
                    #  discounting by how far in the future
                    batch_future_rewards += list(
                        self.discount_future_cumsum(
                            episode_rewards, self.gamma))
                    self.scores.append(success_steps)
                    state, done, episode_rewards = env.reset(), False, []
                    success_steps = 0
                else:
                    success_steps += 1
            
            # Add any trailing rewards to done
            batch_future_rewards += list(
                self.discount_future_cumsum(
                    episode_rewards, self.gamma)
            )
            episode_rewards = []

            # HANDLE END OF EPISODE
            batch_advs = np.array(batch_future_rewards)

            # This is R(tau), normalised
            normalised_batch_advs = ( 
                (batch_advs - np.mean(batch_advs))
                / (np.std(batch_advs) + 1e-8)
            )

            self.remember(state_batch, act_batch, normalised_batch_advs)
            self.learn(*self.get_batch_to_train())

            solved = self.handle_episode_end(
                state, state_next, reward, 
                step, max_iters, verbose=verbose)

            if solved:
                break
        
        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def remember(self, state_batch, act_batch, batch_advs):

        self.memory = (state_batch, act_batch, batch_advs)

    def get_batch_to_train(self):

        assert len(self.memory[0]) == len(self.memory[1]), f"{len(self.memory[0])}, {len(self.memory[1])}"
        assert len(self.memory[1]) == len(self.memory[2]), f"{len(self.memory[1])}, {len(self.memory[2])}"

        minibatch_i = np.random.choice(len(self.memory[0]),
            min(self.batch_size, len(self.memory[0])),
            )
        
        sampled_memory = []
        for i in range(len(self.memory)):
            sampled_memory.append(tf.convert_to_tensor([self.memory[i][j] for j in minibatch_i]))

        self.memory = []  # Only learning from last set of trajectories

        return sampled_memory
    
    def learn(self, sts, acts, advs):
        """Updated the agent's decision network based
        on a sample of previous decisions it has seen.
        Here, we combine the target and action networks.
        """

        loss_value = self.take_training_step(sts, acts, advs)

        if self.epsilon:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay_rate

        return loss_value

    @conditional_decorator(tf.function, can_graph)
    def take_training_step(self, sts, acts, advs):
        tf.debugging.assert_equal(tf.shape(sts)[0], tf.size(acts), summarize=1) 
        tf.debugging.assert_equal(tf.size(acts), tf.size(advs), summarize=1)

        with tf.GradientTape() as tape:
            
            # One step away from Pi_theta(at|st)
            pi_action_logits = self.model(sts)
            
            action_one_hots = tf.one_hot(
                acts, self.action_size_tensor, dtype=tf.float64)
            
            # This IS pi_theta(at|st), only at the actual action taken
            pi_action_log_probs = tf.math.reduce_sum(
                action_one_hots * tf.nn.log_softmax(pi_action_logits), 
                axis=1)

            tf.debugging.assert_equal(tf.size(advs), tf.size(pi_action_log_probs))

            loss_value = - tf.math.reduce_mean(
                advs * pi_action_log_probs
            )

        grads = tape.gradient(loss_value, self.model.trainable_variables)

        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return loss_value
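
    # The loss above is the standard REINFORCE-style surrogate (restated here, not
    # extra functionality): minimising
    #   L(theta) = -E[ A_t * log pi_theta(a_t | s_t) ]
    # makes gradient descent on L equivalent to gradient ascent on the expected
    # advantage-weighted log-probability of the actions actually taken.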

    def save_state(self, add_to_save=None):
        """Save a (trained) model with its weights to a specified file.
        Metadata should be passed to keep information available.
        """

        self.save_state_to_dict(append_dict={
            "optimizer_config": self.optimizer.get_config(),
            "epislon": self.epsilon,
        })

        self.model.save(self.model_location)

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        
        if os.path.exists(self.model_location):
            self.model = tf.keras.models.load_model(self.model_location)

            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config

            print(" Loaded.")
        
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
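
# Hypothetical usage (StandardAgent and the env_wrapper are project-specific and only
# assumed here; argument values are illustrative):
#   agent = VPGSolver("cartpole_experiment", env_wrapper, gamma=0.99, n_cycles=128)
#   solved = agent.solve(max_iters=500, verbose=True)
#   agent.save_state()
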
Example 3
class PPOSolver(StandardAgent):
    """
    PPO Solver
    Inspired by:
      https://github.com/anita-hu/TF2-RL/blob/master/PPO/TF2_PPO.py
      https://github.com/ajleite/basic-ppo/blob/master/ppo.py
    """

    can_graph = True

    def __init__(self, 
        experiment_name,
        env_wrapper,
        clip_ratio=0.2,
        val_coef=1.0,
        entropy_coef=0.01,
        lam=1.0,
        gamma=0.95,
        actors=1,
        cycle_length=128,
        minibatch_size_per_actor=64,
        cycle_epochs=4,
        learning_rate=5e-4,
        model_name="ppo",
        saving=True):

        super(PPOSolver, self).__init__(
            env_wrapper,
            model_name,
            experiment_name, 
            saving=saving)

        self.clip_ratio = clip_ratio
        self.gamma = gamma
        self.lam = lam
        self.val_coef = val_coef
        self.entropy_coef = entropy_coef

        self.actors = actors
        self.cycle_length = cycle_length  # Run this many per epoch
        self.batch_size = cycle_length * actors  # Sample from the memory
        self.minibatch_size = minibatch_size_per_actor * actors  # train on batch
        self.cycle_epochs = cycle_epochs  # Train for this many epochs

        # self.num_init_random_rollouts = num_init_random_rollouts
        self.model_name = model_name

        self.solved_on = None

        self.model = PPOModel(
            self.state_size, self.action_size, model_name=self.model_name)
        self.model.build(input_shape=(None, self.state_size))

        # self._random_dataset = self._gather_rollouts(
        #     env_wrapper, num_init_random_rollouts, epsilon=1.)

        self.optimizer = Adam(lr=learning_rate)

        head, _, _ = self.model_location.rpartition(".h5")
        self.model_location = head + ".weights"
        self.load_state()

    def show(self, render=False):
        raise NotImplementedError("self.model needs to be adapted in super")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env_trackers = [EnvTracker(self.env_wrapper) for _ in range(self.actors)]
        solved = False

        # Every episode return ever
        all_episode_returns = []
        all_episode_steps = []

        for iteration in range(max_iters):
            data = []  # Refresh every batch (on-policy)

            for env_tracker in env_trackers:
                state = env_tracker.latest_state
                states, actions, log_probs, rewards, v_preds =\
                    [], [], [], [], []

                for step in range(self.cycle_length):
                    if render:
                        env_tracker.env.render()

                    action, value, log_prob = (
                        tf.squeeze(x).numpy() for x in
                        self.model.act_value_logprobs(
                            state, 
                            eps=None)
                    )
                    observation, reward, done, _ = env_tracker.env.step(action)
                    state_next = observation

                    # Custom reward if required by env wrapper
                    reward = self.env_wrapper.reward_on_step(
                        state, state_next, reward, done, step)

                    env_tracker.return_so_far += reward

                    states.append(state)
                    actions.append(action)
                    log_probs.append(log_prob)
                    rewards.append(np.float64(reward))
                    v_preds.append(value)

                    self.report_step(step, iteration, max_iters)
                    if done:
                        all_episode_returns.append(
                            env_tracker.return_so_far)
                        all_episode_steps.append(env_tracker.steps_so_far)
                        state = env_tracker.env.reset()
                        env_tracker.steps_so_far = 0
                        env_tracker.return_so_far = 0.
                    else:
                        env_tracker.steps_so_far += 1
                        state = observation

                next_v_preds = v_preds[1:] + [0.]  # TODO - both right float?
                gaes = self.get_norm_general_advantage_est(
                    rewards, v_preds, next_v_preds)

                # TODO make a handler object
                if not data:
                    data = [
                        states, actions, log_probs, next_v_preds, rewards, 
                        gaes
                    ]
                else:
                    data[0] += states; data[1] += actions; data[2] += log_probs
                    data[3] += next_v_preds; data[4] += rewards; data[5] += gaes

                env_tracker.latest_state = state

            self.scores = all_episode_steps  # FIXME this won't handle picking up from left-off
            solved = self.handle_episode_end(
                state, state_next, reward, 
                step, max_iters, verbose=verbose)
            if solved: 
                break

            self.take_training_step(
                *(tf.convert_to_tensor(lst) for lst in data)
                # *tuple(map(tf.convert_to_tensor, zip(*memory)))
            )
        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def get_norm_general_advantage_est(self, rewards, v_preds, next_v_preds):
        # Sources:
        #  https://github.com/uidilr/ppo_tf/blob/master/ppo.py#L98
        #  https://github.com/anita-hu/TF2-RL/blob/master/PPO/TF2_PPO.py
        deltas = [
            r_t + self.gamma * v_next - v for r_t, v_next, v in 
            zip(rewards, next_v_preds, v_preds)
        ]
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + self.lam * self.gamma * gaes[t + 1]

        gaes = np.array(gaes).astype(np.float64)
        norm_gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

        return norm_gaes
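
    # For reference, the standard GAE recursion implemented above:
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma * lam * A_{t+1}
    # followed by normalisation of the advantages to zero mean / unit variance.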

    @conditional_decorator(tf.function, can_graph)
    def take_training_step(self, sts, a, log_p, nxt_v_pred, r, adv):
        """
        Performs gradient descent on minibatches of size minibatch_size,
        sampled from a batch of batch_size examples drawn from the memory.

        Samples without replacement (to check).
        """

        assert self.batch_size == len(r)

        for _ in range(self.cycle_epochs):
            # Batch from the examples in the memory
            shuffled_indices = tf.random.shuffle(tf.range(self.batch_size))  # Every index of the cycle examples
            num_mb = self.batch_size // self.minibatch_size
            # Pick minibatch-sized samples from there
            for minibatch_i in tf.split(shuffled_indices, num_mb):
                minibatch = (
                    tf.gather(x, minibatch_i, axis=0) 
                    for x in (sts, a, log_p, nxt_v_pred, r, adv)
                )
                self.train_minibatch(*minibatch)

        # TODO used to be zip weights and assign
        # for pi_old_w, pi_w in zip(
        #         self.pi_model_old.weights, self.pi_model.weights):
        #     pi_old_w.assign(pi_w)
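
    # Illustrative numbers (defaults above, single actor): batch_size = 128 and
    # minibatch_size = 64 give 2 minibatches per pass, and cycle_epochs = 4 passes
    # yield 8 gradient updates per collected cycle of experience.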
    
    @conditional_decorator(tf.function, can_graph)
    def train_minibatch(self, sts, a, log_p, nxt_v_pred, r, adv):
       
        # Convert from (64,) to (64, 1)
        r = tf.expand_dims(r, axis=-1)
        nxt_v_pred = tf.expand_dims(nxt_v_pred, axis=-1)

        with tf.GradientTape() as tape:
            new_log_p, entropy, sts_vals = self.model.evaluate_actions(sts, a)
            ratios = tf.exp(new_log_p - log_p)

            clipped_ratios = tf.clip_by_value(
                ratios, 
                clip_value_min=1-self.clip_ratio, 
                clip_value_max=1+self.clip_ratio
            )
            loss_clip = tf.reduce_mean(
                tf.minimum((adv  * ratios), (adv * clipped_ratios))
            )
            target_values = r + self.gamma * nxt_v_pred

            vf_loss = tf.reduce_mean(
                tf.math.square(sts_vals - target_values)
            )

            entropy = tf.reduce_mean(entropy)

            total_loss = ( 
                - loss_clip 
                + self.val_coef * vf_loss 
                - self.entropy_coef * entropy
            )
        train_variables = self.model.trainable_variables
        grads = tape.gradient(total_loss, train_variables)
        self.optimizer.apply_gradients(zip(grads, train_variables))
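
    # The objective assembled above is the usual PPO clipped surrogate, restated here:
    #   L = -E[ min(r_t(theta) * A_t, clip(r_t(theta), 1 - eps, 1 + eps) * A_t) ]
    #       + c_v * MSE(V(s_t), r_t + gamma * V(s_{t+1})) - c_e * entropy
    # where r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t) = exp(new_log_p - log_p).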

    def save_state(self, verbose=False):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for the 
         model.
        """

        add_to_save = {
            # "epsilon": self.epsilon,
            # "memory": self.memory,
            "optimizer_config": self.optimizer.get_config(),
            }

        self.save_state_to_dict(append_dict=add_to_save)

        if verbose:
            print("Saving to", self.model_location)

        self.model.save_weights(self.model_location) # , save_format='tf')

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model.load_weights(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
Example 4
class DDPGSolver(StandardAgent):
    """
    A standard ddpg solver:
      https://github.com/openai/baselines/blob/master/baselines/a2c/a2c.py
    Inspired by
      https://github.com/anita-hu/TF2-RL/blob/master/DDPG/TF2_DDPG_Basic.py
    """
    def __init__(
        self,
        experiment_name,
        env_wrapper,
        ent_coef=1e-4,
        vf_coef=0.5,
        n_cycles=128,
        batch_size=64,
        max_grad_norm=0.5,
        learning_rate_actor=1e-5,
        learning_rate_critic=1e-3,
        memory_len=100000,
        gamma=0.99,
        epsilon=None,
        tau=0.125,
        lrschedule='linear',
        model_name="ddpg",
        saving=True,
        rollout_steps=5000,
    ):

        super(DDPGSolver, self).__init__(env_wrapper,
                                         model_name,
                                         experiment_name,
                                         saving=saving)

        self.n_cycles = n_cycles
        self.batch_size = batch_size

        self.gamma = gamma
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef

        # NOTE new AND need to verify deque is safe
        self.memory = deque(maxlen=memory_len)
        self.epsilon = epsilon  # new but should be in A2C
        self.tau = tau

        # TODO reimplement
        # self.max_grad_norm = max_grad_norm
        # self.epsilon = epsilon  # exploration rate

        self.solved_on = None

        self.actor = self.build_model(model_name=model_name + "_actor")
        self.actor.build(input_shape=(
            None,
            self.state_size,
        ))

        self.actor_dash = self.build_model(model_name=model_name +
                                           "_actor_target")
        self.actor_dash.build(input_shape=(
            None,
            self.state_size,
        ))

        self.actor_dash.set_weights(self.actor.get_weights())

        self.actor_optimizer = Adam(learning_rate=learning_rate_actor)
        self.actor.summary()

        self.critic = self.build_critic_model(self.state_size,
                                              self.action_size,
                                              model_name=model_name +
                                              "_critic")
        # self.critic.build(input_shape=[(state_size,), (action_size,)])
        self.critic_dash = self.build_critic_model(self.state_size,
                                                   self.action_size,
                                                   model_name=model_name +
                                                   "_critic_target")
        # self.critic_dash.build(input_shape=[(state_size,), (action_size,)])

        self.critic_dash.set_weights(self.critic.get_weights())

        self.critic_optimizer = Adam(learning_rate=learning_rate_critic)
        self.critic.summary()

        self.load_state()

        self.rollout_memory(rollout_steps - len(self.memory))

    def build_critic_model(self, input_size, action_size, model_name='critic'):
        """
        Returns Q(st+1 | a, s)
        """

        inputs = [Input(shape=(input_size,)), Input(shape=(action_size,))]
        concat = Concatenate(axis=-1)(inputs)
        x = Dense(24, name="hidden_1", activation='tanh')(concat)
        x = Dense(48, name="hidden_2", activation='tanh')(x)
        output = Dense(1, name="Out")(x)
        model = Model(inputs=inputs, outputs=output, name=model_name)
        model.build(input_shape=[(input_size, ), (action_size, )])

        return model

    def act_with_noise(self, state, add_noise=True):
        raise NotImplementedError(
            "Consider implementing from\nhttps://github.com/anita-hu/"
            "TF2-RL/blob/master/DDPG/TF2_DDPG_Basic.py")

    def show(self, render=False):
        raise NotImplementedError("self.model needs to be adapted in super")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state = env.reset()

        success_steps = 0

        for iteration in range(max_iters):
            for step in range(self.n_cycles):  # itertools.count():
                if render:
                    env.render()

                # TODO implement act and add noise
                action_dist = self.actor(tf.expand_dims(state, axis=0))
                observation, reward, done, _ = env.step(np.argmax(action_dist))

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, observation, reward, done, step)

                self.memory.append((state, tf.squeeze(action_dist),
                                    np.float64(reward), observation, done))
                state = observation

                self.report_step(step, iteration, max_iters)

                if done:
                    # OR env_wrapper.get_score(state, observation, reward, step)
                    self.scores.append(success_steps)
                    success_steps = 0
                    state = env.reset()
                else:
                    success_steps += 1

                self.take_training_step()

            solved = self.handle_episode_end(state,
                                             observation,
                                             reward,
                                             step,
                                             max_iters,
                                             verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def take_training_step(self):
        if len(self.memory) < self.batch_size:
            return

        # Note: min is actually unnecessary given the condition above
        minibatch_i = np.random.choice(
            len(self.memory),
            min(self.batch_size, len(self.memory)),
        )

        minibatch = [self.memory[i] for i in minibatch_i]

        # Obs on [adv, return]
        loss_value = self.train_on_minibatch(
            *tuple(map(tf.convert_to_tensor, zip(*minibatch))))

        # Update weights
        for model_name in "actor", "critic":
            self.update_weights(model_name, self.tau)

        # TODO decrease epsilon if not None

    @tf.function()
    def train_on_minibatch(self, sts, a, r, n_sts, d):

        # r + gam(1-d)Q_phi_targ(s_t+1, mu_theta_targ(s_t+1))
        n_a = self.actor_dash(n_sts)
        q_future_pred = self.critic_dash([n_sts, n_a])
        target_qs = r + tf.where(
            d, tf.zeros(shape=q_future_pred.shape, dtype=tf.dtypes.float64),
            self.gamma * q_future_pred)

        # Minimise (r + target on next state) - (current critic on sts and a)
        # Makes critic better at predicting future
        with tf.GradientTape() as tape:
            updated_q_values = self.critic([sts, a])
            critic_loss = tf.reduce_mean(
                tf.math.square(updated_q_values - target_qs))

        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        # Use the (improving) critic to rate the actor's updated decision
        # Minimising loss means maximising actor's expectation
        with tf.GradientTape() as tape:
            # mu_phi(s)
            updated_action_dist = self.actor(sts)
            # Works due to the chain rule: tracks mu's gradients to improve mu's prediction
            # TODO this is quite nuanced - check this through
            actor_loss = -tf.reduce_mean(
                self.critic([sts, updated_action_dist]))

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

    def update_weights(self, model_name, tau):
        weights = getattr(getattr(self, model_name), "weights")
        target_model = getattr(self, model_name + "_dash")
        target_weights = target_model.weights
        target_model.set_weights([
            weights[i] * tau + target_weights[i] * (1. - tau)
            for i in range(len(weights))
        ])
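
    # This is a Polyak (soft) update of the target network:
    #   theta_target <- tau * theta + (1 - tau) * theta_target
    # with tau = 0.125 by default, so the targets track the online networks slowly.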

    def save_state(self):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for the 
         model.
        """

        add_to_save = {
            "memory": self.memory,
            "epsilon": self.epsilon,
            "actor_optimizer_config": self.actor_optimizer.get_config(),
            "critic_optimizer_config": self.critic_optimizer.get_config(),
        }

        self.save_state_to_dict(append_dict=add_to_save)

        for var in ("actor", "actor_dash", "critic", "critic_dash"):
            model = getattr(self, var)
            model.save_weights(
                self.model_location.replace(".h5", "_" + var + ".h5"))

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            for var in ("actor", "actor_dash", "critic", "critic_dash"):
                model = getattr(self, var)
                model.load_weights(
                    self.model_location.replace(".h5", "_" + var + ".h5"))
            self.actor_optimizer = self.actor_optimizer.from_config(
                self.actor_optimizer_config)
            self.critic_optimizer = self.critic_optimizer.from_config(
                self.critic_optimizer_config)
            del model_dict[
                "actor_optimizer_config"], self.actor_optimizer_config
            del model_dict[
                "critic_optimizer_config"], self.critic_optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)

    def rollout_memory(self, rollout_steps, render=False):
        if rollout_steps <= 0:
            return
        print("Rolling out steps", rollout_steps)
        env = self.env_wrapper.env
        state = env.reset()

        max_iters = rollout_steps // self.n_cycles

        for iteration in range(max_iters):
            for step in range(self.n_cycles):
                if render:
                    env.render()

                # TODO implement act and add noise
                action_dist = self.actor(tf.expand_dims(state, axis=0))
                observation, reward, done, _ = env.step(np.argmax(action_dist))

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, observation, reward, done, step)

                self.memory.append((state, tf.squeeze(action_dist),
                                    np.float64(reward), observation, done))
                state = observation

                self.report_step(step, iteration, max_iters)

                if done:
                    state = env.reset()

        print("\nCompleted.")
Example 5
class DQNSolver(StandardAgent):
    """
    A standard dqn_solver, inspired by:
      https://gym.openai.com/evaluations/eval_EIcM1ZBnQW2LBaFN6FY65g/
    Implements a simple DNN that predicts values.
    """
    def __init__(self,
                 experiment_name,
                 env_wrapper,
                 memory_len=100000,
                 gamma=0.99,
                 batch_size=64,
                 n_cycles=128,
                 epsilon=1.,
                 epsilon_min=0.01,
                 epsilon_decay=0.995,
                 learning_rate=0.01,
                 learning_rate_decay=0.01,
                 rollout_steps=10000,
                 model_name="dqn",
                 saving=True):

        super(DQNSolver, self).__init__(env_wrapper,
                                        model_name,
                                        experiment_name,
                                        saving=saving)

        # Training
        self.batch_size = batch_size
        self.n_cycles = n_cycles

        self.memory = deque(maxlen=memory_len)
        self.solved_on = None

        self.gamma = gamma  # discount rate was 1
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay  # 0.995

        self.model = self.build_model()

        self.optimizer = Adam(lr=learning_rate, decay=learning_rate_decay)

        self.load_state()

        self.rollout_memory(rollout_steps - len(self.memory))

    def rollout_memory(self, rollout_steps, verbose=False, render=False):
        if rollout_steps <= 0:
            return
        env = self.env_wrapper.env
        state = env.reset()
        for step in range(rollout_steps):
            if render:
                env.render()

            action = self.act(self.model, state, epsilon=1.)  # Max random
            observation, reward, done, _ = env.step(action)
            state_next = observation

            # Custom reward if required by env wrapper
            reward = self.env_wrapper.reward_on_step(state, state_next, reward,
                                                     done, step)

            self.memory.append(
                (state, np.int32(action), reward, state_next, done))
            state = observation

            if done:
                state = env.reset()
                # OR env_wrapper.get_score(state, state_next, reward, step)
        print(f"Rolled out {len(self.memory)}")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state = env.reset()
        success_steps = 0

        for iteration in range(max_iters):
            for step in range(self.n_cycles):
                if render:
                    env.render()

                action = self.act(self.model, state, epsilon=self.epsilon)
                observation, reward, done, _ = env.step(action)
                state_next = observation

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, state_next, reward, done, step)

                self.memory.append(
                    (state, np.int32(action), reward, state_next, done))
                state = observation

                self.report_step(step, iteration, max_iters)
                if done:
                    state = env.reset()
                    # OR env_wrapper.get_score(state, state_next, reward, step)
                    self.scores.append(success_steps)
                    success_steps = 0
                else:
                    success_steps += 1

                self.learn()

            score = step

            solved = self.handle_episode_end(state,
                                             state_next,
                                             reward,
                                             step,
                                             max_iters,
                                             verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def learn(self):
        """
        Updates the agent's decision network based
        on a sample of previous decisions it has seen.
        Here, we combine the target and action networks.
        """
        if len(self.memory) < self.batch_size:
            return

        args_as_tuple = get_batch_from_memory(self.memory, self.batch_size)

        loss_value = self.take_training_step(*args_as_tuple)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    @tf.function
    def take_training_step(self, sts, a, r, n_sts, d):

        future_q_pred = tf.math.reduce_max(self.model(n_sts), axis=-1)
        future_q_pred = tf.where(d, tf.zeros((1, ), dtype=tf.dtypes.float64),
                                 future_q_pred)
        q_targets = tf.cast(r, tf.float64) + self.gamma * future_q_pred

        loss_value, grads = self.squared_diff_loss_at_a(sts, a, q_targets)

        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return loss_value
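
    # The target computed above is the one-step Bellman target:
    #   y = r + gamma * max_a' Q(s', a')        (y = r on terminal transitions)
    # and the loss below differences y against Q(s, a) for the action actually taken.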

    @tf.function
    def squared_diff_loss_at_a(self, sts, a, q_next):
        """
        A squared difference loss function 
        Diffs the Q model's predicted values for a state with 
        the actual reward + predicted values for the next state
        """
        with tf.GradientTape() as tape:
            q_s = self.model(sts)  # Q(st)
            # Take only predicted value of the action taken for Q(st|at)
            gather_indices = tf.range(a.shape[0]) * tf.shape(q_s)[-1] + a
            q_s_a = tf.gather(tf.reshape(q_s, [-1]), gather_indices)

            # Q(st|at) diff Q(st+1)
            losses = tf.math.squared_difference(q_s_a, q_next)
            reduced_loss = tf.math.reduce_mean(losses)

        return (reduced_loss,
                tape.gradient(reduced_loss, self.model.trainable_variables))

    def save_state(self):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for the 
         model.
        """

        add_to_save = {
            "epsilon": self.epsilon,
            "memory": self.memory,
            "optimizer_config": self.optimizer.get_config(),
        }

        self.save_state_to_dict(append_dict=add_to_save)

        self.model.save(self.model_location)

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model = tf.keras.models.load_model(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)