Example #1
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate,
                 grad_applier,
                 show_env=False,
                 local_t_max=20,
                 max_global_time_step=10 * 10**7,
                 gamma=0.99,
                 save_interval_step=100 * 1000,
                 env='Breakout-v0',
                 device='/cpu:0'):

        self.thread_index = thread_index
        self.learning_rate = learning_rate
        self.env = env

        # Whether to render the environment during
        # training (main.py turns this on for one
        # of the agents)
        self.show_env = show_env

        # Discount factor for the reward
        self.gamma = gamma

        # Maximum number of global training steps
        self.max_global_time_step = max_global_time_step

        # Number of steps per update (also the LSTM unroll length)
        self.local_t_max = local_t_max

        # Number of actions the agent can take
        self.action_size = Environment.get_action_size(env)

        self.local_network = A3C(self.action_size, self.thread_index, device)

        self.global_network = global_network

        # Build computational graph
        self.local_network._create_network()

        # Build computational graph for the losses
        # and gradients
        self.local_network.prepare_a3c_loss()
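
        # minimize_local computes gradients of the local
        # loss w.r.t. the local weights and applies them
        # to the shared global weights (the asynchronous
        # update at the heart of A3C)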
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.a3c_loss, global_network.get_vars(),
            self.local_network.get_vars())

        # Sync the weights of the local network with those
        # of the main network
        self.sync = self.local_network.sync_from(global_network)

        # Initialize time step, learning rate, etc.
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
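
The body of sync_from is not part of this example. As a minimal sketch (assuming, as the call sites suggest, that get_vars() returns each network's variables in a matching order), it could be a method on the A3C class that returns a grouped op copying every global variable into the local network:

import tensorflow as tf

def sync_from(self, src_network, name=None):
    # Pair each global (source) variable with its
    # local (destination) counterpart
    src_vars = src_network.get_vars()
    dst_vars = self.get_vars()

    # Group one assign per variable so a single
    # sess.run(sync) refreshes the whole local network
    sync_ops = [dst.assign(src) for src, dst in zip(src_vars, dst_vars)]
    return tf.group(*sync_ops, name=name)

Running self.sync at the start of each rollout keeps the worker's weights in step with the shared network before new gradients are collected.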
Example #2
    def run(self):
        """ Run the model """

        with tf.device(self.device):
            # The initial learning rate is sampled from a
            # log-uniform range between 0.0001 and
            # 0.005, then decayed linearly to 0 over
            # the course of training
            initial_learning_rate = log_uniform(self.initial_alpha_low,
                                                self.initial_alpha_high,
                                                0.5)

            # Whether to terminate, pause, or keep training
            self.stop = False
            self.terminate = False

            # Initialize global time step
            self.global_t = 0

            # Number of actions the agent can take
            action_size = Environment.get_action_size(self.env)

            # Initialize the shared/global network
            self.global_network = A3C(action_size,
                                      thread_index=-1,
                                      device=self.device)

            # Build computational graph
            self.global_network._create_network()

            # Placeholder for the Trainers
            self.trainers = []

            # Placeholder for the learning rate, fed at run
            # time so it can be annealed during training
            learning_rate_input = tf.placeholder(tf.float32)

            # Initialize the RMSProp gradient applier for the updates
            grad_applier = RMSPropApplier(learning_rate_input)

            # Build the agents
            for i in range(self.parallel_size):
                trainer = Trainer(thread_index=i,
                                  global_network=self.global_network,
                                  initial_learning_rate=initial_learning_rate,
                                  grad_applier=grad_applier,
                                  learning_rate=learning_rate_input)
                if i == 0:
                    trainer.show_env = True

                self.trainers.append(trainer)

            # Initialize Session
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())

            # Params for logging scores in TensorBoard
            self.score_input = tf.placeholder(tf.int32)
            tf.summary.scalar("score", self.score_input)
            self.summary_op = tf.summary.merge_all()
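
            # (At log points a trainer can feed an episode score
            # into score_input, run summary_op, and write the
            # result with the FileWriter created below)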

            # sess.graph contains the graph definition,
            # which enables the Graph Visualizer. To start
            # TensorBoard, run the following command:
            # $ tensorboard --logdir=path/to/LOG_FILE
            self.summary_writer = tf.summary.FileWriter(LOG_FILE,
                                                        graph=self.sess.graph)


            # Parameters for saving the global network params
            self.saver = tf.train.Saver(var_list=self.global_network.get_vars(),
                                        max_to_keep=1)

            # Set next checkpoint
            self.next_checkpoint = self.checkpoint_interval

            # Set next log point
            self.next_log = self.logging_interval

            # -----------
            # RUN THREADS
            # -----------

            self.train_threads = []
            for i in range(self.parallel_size):
                self.train_threads.append(threading.Thread(target=self.train,
                                                           args=(i, True)))
            for t in self.train_threads:
                t.start()
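
Neither log_uniform nor the linear decay described in the comments is shown in these examples. A minimal sketch of both, assuming log_uniform interpolates between its bounds on a log scale (a rate of 0.5 then picks the geometric midpoint) and that the annealed value is what each trainer feeds into learning_rate_input:

import numpy as np

def log_uniform(lo, hi, rate):
    # Interpolate between lo and hi on a log scale:
    # rate=0.0 gives lo, rate=1.0 gives hi
    log_lo, log_hi = np.log(lo), np.log(hi)
    return float(np.exp(log_lo * (1.0 - rate) + log_hi * rate))

def anneal_learning_rate(initial_lr, global_t, max_global_time_step):
    # Decay linearly from initial_lr to 0 as global_t
    # approaches max_global_time_step
    lr = initial_lr * (max_global_time_step - global_t) / max_global_time_step
    return max(lr, 0.0)

Each trainer would then pass the annealed value through the feed_dict of its training sess.run call, e.g. feed_dict={learning_rate_input: anneal_learning_rate(initial_learning_rate, global_t, max_global_time_step)}.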