Example #1
 def _agent(self, cfg):
     """return a TFDqnAgent"""
     net = self._net(cfg["network"])
     optimizer = cfg["optimizer"]["optimizer"](
         learning_rate=cfg["optimizer"]["learning_rate"])
     loss_fn = cfg["optimizer"]["loss_fn"]
     if cfg["type"].lower() == "dqn":
         return dqn_agent.DqnAgent(
             self.env.time_step_spec(),
             self.env.action_spec(),
             q_network=net,
             optimizer=optimizer,
             td_errors_loss_fn=loss_fn,
             target_update_tau=cfg["target"]["soft"],
             target_update_period=cfg["target"]["period"],
             gamma=cfg["gamma"],
             reward_scale_factor=1.0,
             train_step_counter=tf.Variable(0))
     elif cfg["type"].lower() == "ddqn":
         return dqn_agent.DdqnAgent(
             self.env.time_step_spec(),
             self.env.action_spec(),
             q_network=net,
             optimizer=optimizer,
             td_errors_loss_fn=loss_fn,
             target_update_tau=cfg["target"]["soft"],
             target_update_period=cfg["target"]["period"],
             gamma=cfg["gamma"],
             reward_scale_factor=1.0,
             train_step_counter=tf.Variable(0))
     else:
         raise ValueError("Unknown type of agent! Input type: {}".format(
             cfg["type"]))
Example #2
        def init_agent():
            """ a DQN agent is set by default in the application"""
            # get the global step
            global_step = tf.compat.v1.train.get_or_create_global_step()

            # TODO: update this to get the optimizer from tensorflow 2.0 if possible
            optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate)

            q_net = q_network.QNetwork(self._rl_app.observation_spec,
                                       self._rl_app.action_spec,
                                       fc_layer_params=fc_layer_params)
            time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
            tf_agent = dqn_agent.DdqnAgent(
                time_step_spec,
                self._rl_app.action_spec,
                q_network=q_net,
                optimizer=optimizer,
                epsilon_greedy=eps_final,
                gradient_clipping=gradient_clipping,
                td_errors_loss_fn=common.element_wise_squared_loss,
                train_step_counter=global_step,
                debug_summaries=True,
                summarize_grads_and_vars=True)
            tf_agent.initialize()
            logger.info("tf_agent initialization is complete")

            # Optimize by wrapping some of the code in a graph using TF function.
            tf_agent.train = common.function(tf_agent.train)

            return tf_agent
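The closure above captures several names defined outside the snippet; plausible, purely illustrative values would be:

# Assumed values for the variables captured by init_agent(); not part of the original source.
learning_rate = 1e-3          # Adam learning rate
fc_layer_params = (100, 50)   # hidden layer sizes of the QNetwork
eps_final = 0.1               # epsilon used by the epsilon-greedy collect policy
gradient_clipping = None      # or e.g. 1.0 to clip gradients by norm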
Example #3
def create_agent(observation_spec, action_spec, time_step_spec, step_counter,
                 use_double_q=False):
  """Creates a DQN/DQRNN agent."""
  train_sequence_length = FLAGS.train_sequence_length
  if train_sequence_length > 1:
    q_net = q_rnn_network.RnnNetwork(
        observation_spec,
        action_spec,
        input_fc_layer_params=parse_str_flag(FLAGS.input_fc_layers),
        cell_type=FLAGS.network_type,
        hidden_size=parse_str_flag(FLAGS.hidden_sizes),
        output_fc_layer_params=parse_str_flag(FLAGS.output_fc_layers))
  else:
    q_net = q_network.QNetwork(
        observation_spec,
        action_spec,
        fc_layer_params=parse_str_flag(FLAGS.fc_layers))
    train_sequence_length = FLAGS.n_step_update

  if FLAGS.use_double_q:
    tf_agent = dqn_agent.DdqnAgent(
        time_step_spec,
        action_spec,
        q_network=q_net,
        epsilon_greedy=FLAGS.epsilon_greedy,
        n_step_update=FLAGS.n_step_update,
        boltzmann_temperature=FLAGS.boltzmann_temperature,
        target_update_tau=FLAGS.target_update_tau,
        target_update_period=FLAGS.target_update_period,
        optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=FLAGS.gamma,
        gradient_clipping=FLAGS.gradient_clipping,
        debug_summaries=FLAGS.debug_summaries,
        summarize_grads_and_vars=FLAGS.summarize_grads_and_vars,
        train_step_counter=step_counter)
  else:
    tf_agent = dqn_agent.DqnAgent(
        time_step_spec,
        action_spec,
        q_network=q_net,
        epsilon_greedy=FLAGS.epsilon_greedy,
        n_step_update=FLAGS.n_step_update,
        boltzmann_temperature=FLAGS.boltzmann_temperature,
        target_update_tau=FLAGS.target_update_tau,
        target_update_period=FLAGS.target_update_period,
        optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=FLAGS.gamma,
        gradient_clipping=FLAGS.gradient_clipping,
        debug_summaries=FLAGS.debug_summaries,
        summarize_grads_and_vars=FLAGS.summarize_grads_and_vars,
        train_step_counter=step_counter)
  return tf_agent, train_sequence_length
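`parse_str_flag` is not part of this example; judging by its usage, a sketch that turns a comma-separated flag string such as "100,50" into a tuple of layer sizes could be:

def parse_str_flag(flag_value):
    """Hypothetical helper: parse a comma-separated string flag into a tuple of ints."""
    if not flag_value:
        return None
    return tuple(int(v) for v in str(flag_value).split(","))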
Example #4
    def create_tf_ddqn_agent(self, q_network, alpha, gamma, epsilon, init_temp,
                             cooldown_time):
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=alpha)
        train_step_counter = tf.Variable(0)
        _temp = tf.Variable(float(init_temp)) - tf.dtypes.cast(
            train_step_counter, tf.float32) * tf.Variable(
                (init_temp - 1) / (STEP_ITERATIONS * cooldown_time))

        if epsilon is not None:
            _temp = None

        return dqn_agent.DdqnAgent(self.tf_training_env.time_step_spec(),
                                   self.tf_training_env.action_spec(),
                                   q_network=q_network,
                                   optimizer=optimizer,
                                   gamma=gamma,
                                   epsilon_greedy=epsilon,
                                   boltzmann_temperature=_temp,
                                   td_errors_loss_fn=element_wise_huber_loss,
                                   train_step_counter=train_step_counter,
                                   gradient_clipping=10.0)
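Note that `_temp` above is evaluated once, so in eager mode it captures the temperature at step 0 rather than a decaying schedule. If a step-dependent cooldown is intended, one option is to pass a callable instead, assuming `boltzmann_temperature` accepts one the same way `epsilon_greedy` does in Example #5:

import tensorflow as tf

# Sketch of a callable temperature schedule (assumption: boltzmann_temperature may be a callable).
def make_temperature_fn(init_temp, cooldown_time, train_step_counter):
    """Linearly cool the Boltzmann temperature from init_temp towards 1.0."""
    slope = (init_temp - 1.0) / (STEP_ITERATIONS * cooldown_time)  # STEP_ITERATIONS as above
    def temperature():
        step = tf.cast(train_step_counter, tf.float32)
        return tf.maximum(1.0, float(init_temp) - step * slope)
    return temperature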
Example #5
    def create_agent(self):
        # a deep neural network to learn Q(s,a)
        q_net = q_network.QNetwork(
            self._train_env.observation_spec(),
            self._train_env.action_spec(),
            #conv_layer_params= param.QNET_CONV_LAYERS,
            fc_layer_params=param.QNET_FC_LAYERS)

        # optional counter that increments every time the train op is run
        self._train_step = tf.Variable(0)

        # an adaptive learning rate for gradient descent
        optimizer = tf.keras.optimizers.Adam(learning_rate=param.ADAM_LR,
                                             epsilon=param.ADAM_EPSILON)

        # probability of exploration as a function of time steps
        epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=param.DECAY_LR_INIT,
            decay_steps=param.DECAY_STEPS // param.DECAY_UPDATE_PERIOD,
            end_learning_rate=param.DECAY_LR_END)

        # create the double deep Q learning network agent
        self._agent = dqn_agent.DdqnAgent(
            self._train_env.time_step_spec(),
            self._train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            # period (in train steps) at which the target network is updated
            target_update_period=param.AGENT_UPDATE_PERIOD,
            # loss function for gradient descent
            td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"),
            # a discount factor for future rewards.
            gamma=param.AGENT_GAMMA,
            # optional counter that increments every time the train op is run
            train_step_counter=self._train_step,
            epsilon_greedy=lambda: epsilon_fn(self._train_step))
        self._agent.initialize()
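The `param` module used above is not shown; hypothetical values consistent with how its attributes are consumed could be:

# Hypothetical contents of the param module (illustrative values only).
QNET_FC_LAYERS = (100, 50)      # fully connected layer sizes of the QNetwork
ADAM_LR = 1e-3                  # Adam learning rate
ADAM_EPSILON = 1e-7             # Adam epsilon
DECAY_LR_INIT = 1.0             # initial exploration probability
DECAY_LR_END = 0.01             # final exploration probability
DECAY_STEPS = 250000            # decay horizon in environment steps
DECAY_UPDATE_PERIOD = 4         # environment steps per training update
AGENT_UPDATE_PERIOD = 2000      # target network update period
AGENT_GAMMA = 0.99              # discount factor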
Example #6
class TrainAndSaveModel(network.Network):

    #
    #HYPERPARAMETERS
    #
    num_iterations = 200000  #total number of training iterations to run
    #
    initial_collect_steps = 1500
    collect_steps_per_iteration = 1
    replay_buffer_capacity = 250000
    #
    batch_size = 100  #number of sampled trajectories per training update
    learning_rate = 0.000075  #step size for gradient updates (important)
    log_interval = 500  #for printing progress during training
    #
    num_eval_episodes = 15
    eval_interval = 1000  #for deciding when to add a data point of progress
    #
    epsilon = 0.07  #probability of choosing a random action (exploration)
    gamma = 1.0  #discount factor for future rewards
    name = "BlackjackSavant"

    #END OF HYPERPARAMETERS

    #
    #HELPER METHODS
    #
    #records the data that results from executing the specified policy in the environment into the buffer
    #def collect_step(env, policy, buffer):

    #record data over specified number of steps
    def collect_data(env, policy, buffer, steps):
        for _ in range(steps):
            time_step = env.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = env.step(action_step.action)
            traject = trajectory.from_transition(time_step, action_step,
                                                 next_time_step)

            buffer.add_batch(traject)

    #average the reward gained by the policy
    def avg_return(env, policy, num_episodes=10):

        total_return = 0.0
        for _ in range(num_episodes):
            time_step = env.reset()
            episode_return = 0.0

            while (not time_step.is_last()):
                action_step = policy.action(time_step)
                time_step = env.step(action_step.action)
                episode_return += time_step.reward

            total_return += episode_return

        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]

    #END OF HELPER METHODS

    #
    #MAIN EXECUTION
    #

    #get start time
    start_time = time.time()

    #initialize environment and wrap it in tf environment
    env = benv.BlackjackEnv()
    tf_env = tf_py_environment.TFPyEnvironment(env)

    #initialize the Q-network
    network = q_network.QNetwork(tf_env.observation_spec(),
                                 tf_env.action_spec(),
                                 fc_layer_params=(100, ))

    #initialize the agent with the listed parameters
    agent = dqn_agent.DdqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=network,
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=tf.Variable(0),
        epsilon_greedy=epsilon,
        gamma=gamma,
        name=name)
    agent.initialize()

    #create replay buffer to keep track of training
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    #observer that writes collected trajectories into the buffer
    replay_observer = [replay_buffer.add_batch]

    #create step driver
    #collect_op = dynamic_step_driver.DynamicStepDriver(tf_env, agent.collect_policy, observers  = replay_observer, num_steps = 10).run()

    #create random policy to help generate dataset
    random_policy = random_tf_policy.RandomTFPolicy(tf_env.time_step_spec(),
                                                    tf_env.action_spec())

    #populate replay buffer
    collect_data(tf_env, random_policy, replay_buffer, initial_collect_steps)

    #generate trajectories; num steps = 2 so that it views current and next observation in dataset
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)

    #create iterator for dataset to feed the agent
    iterator = iter(dataset)
    #print(iterator)

    #wrap in a graph for TF optimization
    agent.train = common.function(agent.train)

    agent.train_step_counter.assign(0)  #reset

    #Evaluate the initialized policy prior to training for baseline
    avg = avg_return(tf_env, agent.policy, num_eval_episodes)
    returns = [avg
               ]  #holds average returns from multiple points during training

    #main training loop
    for i in range(num_iterations):

        collect_data(tf_env, agent.collect_policy, replay_buffer,
                     collect_steps_per_iteration)

        #sample and update network
        exp, _ = next(iterator)
        loss = agent.train(exp).loss

        #get step
        step = agent.train_step_counter.numpy()

        #log progress or evaluate policy if needed (depending on hyperparameters)
        if (step % log_interval == 0):
            print('step = {0}: loss = {1}'.format(step, loss))

        if (step % eval_interval == 0):
            avg = avg_return(tf_env, agent.policy, num_eval_episodes)
            print('step = {0}: Average return = {1}'.format(step, avg))
            returns.append(avg)
            if (avg > -5):
                saver = policy_saver.PolicySaver(agent.policy, batch_size=None)
                saver.save('./models/policy' + str(i) + "$" + str(avg))

    #results

    #output runtime
    print("<><><>runtime: %s seconds<><><>" % (time.time() - start_time))

    #save the trained agent in the saved_model format for later use
    saver = policy_saver.PolicySaver(agent.policy, batch_size=None)
    saver.save('./models/policyF')
    tf.saved_model.save(agent, "./models/")

    #produce graph of training process
    iterations = range(0, num_iterations + 1, eval_interval)
    plt.plot(iterations, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Iteration')
    plt.title('Average Return Over Time')
    plt.show()
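The single-step helper hinted at by the commented-out `collect_step` stub earlier in this example would follow the same pattern as `collect_data`, just for one transition; a sketch:

from tf_agents.trajectories import trajectory

def collect_step(env, policy, buffer):
    """Execute one step of `policy` in `env` and store the transition in `buffer`."""
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    buffer.add_batch(trajectory.from_transition(time_step, action_step, next_time_step))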
Example #7
fc_layer_params = (560, 60)
conv_layer_params = [(70, (8, 8), 4), (140, (4, 4), 2), (280, (3, 3), 1)]

q_net = q_network.QNetwork(nimble_quest_env.observation_spec(),
                           nimble_quest_env.action_spec(),
                           conv_layer_params=conv_layer_params,
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
global_step = tf.compat.v1.train.get_or_create_global_step()  # get_global_step() would return None if no step exists yet

#########################################################################
agent = dqn_agent.DdqnAgent(nimble_quest_env.time_step_spec(),
                            nimble_quest_env.action_spec(),
                            q_network=q_net,
                            optimizer=optimizer,
                            td_errors_loss_fn=common.element_wise_squared_loss,
                            train_step_counter=global_step)

agent.initialize()
##########################################################################

eval_policy = agent.policy
collect_policy = agent.collect_policy

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=nimble_quest_env.batch_size,
    max_length=replay_buffer_max_length)

checkpoint_dir = os.path.join(tempdir, 'checkpoint')
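The snippet stops right after defining `checkpoint_dir`; presumably it is handed to a `common.Checkpointer`, following the pattern used in the later examples of this collection. A sketch under that assumption:

# Assumed continuation: checkpoint agent, policy, replay buffer and step counter.
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step)
train_checkpointer.initialize_or_restore()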
Example #8
    def __call__(self, trial):
        outs = []
        self.trial += 1
        # parameters
        # (the commented-out alternatives at the end of some lines were filtered out by the first study)

        batch_size = 32
        custom_last_layer = True
        doubleQ = True
        encoder_type = 3
        epsilon_greedy = 0.1
        gamma = 0.99
        gradient_clipping = True
        normalize_env = False
        rate = 0.1
        replay_buffer_max_length = 100000
        target_update_tau = 1
        learning_rate = 1e-4
        num_heads = 4
        initial_collect_steps = 500

        custom_layer_init = trial.suggest_categorical('custom_layer_init',
                                                      [0.5, 1])
        custom_lr_schedule = trial.suggest_categorical(
            'custom_lr_schedule',
            ["No", "Transformer"
             ])  #["Linear","No","Transformer","Transformer_low"]
        layer_type = trial.suggest_categorical('layer_type',
                                               [3, 6])  # [1,2,3,5,6,7]
        loss_function = trial.suggest_categorical(
            'loss_function',
            ["element_wise_huber_loss", "element_wise_squared_loss"])
        target_update_period = trial.suggest_categorical(
            'target_update_period', [5, 10])  #[5, 10,15]

        for x in range(self.args.n_trys):

            global_step = tf.Variable(0,
                                      trainable=False,
                                      dtype="int64",
                                      name="global_step")
            baseEnv = gym.make(self.args.env)
            env = suite_gym.load(self.args.env)
            eval_env = suite_gym.load(self.args.env)
            if normalize_env == True:
                env = NormalizeWrapper(env, self.args.approx_env_boundaries,
                                       self.args.env)
                eval_env = NormalizeWrapper(eval_env,
                                            self.args.approx_env_boundaries,
                                            self.args.env)
            env = PyhistoryWrapper(env, self.args.max_horizon, self.args.atari)
            eval_env = PyhistoryWrapper(eval_env, self.args.max_horizon,
                                        self.args.atari)
            tf_env = tf_py_environment.TFPyEnvironment(env)
            eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)

            q_net = QTransformer(tf_env.observation_spec(),
                                 baseEnv.action_space.n,
                                 num_layers=self.args.num_layers,
                                 d_model=self.args.d_model,
                                 num_heads=num_heads,
                                 dff=self.args.dff,
                                 rate=rate,
                                 encoderType=encoder_type,
                                 enc_layer_type=layer_type,
                                 max_horizon=self.args.max_horizon,
                                 custom_layer=custom_layer_init,
                                 custom_last_layer=custom_last_layer)

            if custom_lr_schedule == "Transformer":  # builds a lr schedule according to the original usage for the transformer
                learning_rate = CustomSchedule(
                    self.args.d_model, int(self.args.num_iterations / 10))
                optimizer = tf.keras.optimizers.Adam(learning_rate,
                                                     beta_1=0.9,
                                                     beta_2=0.98,
                                                     epsilon=1e-9)

            elif custom_lr_schedule == "Transformer_low":  # builds a lr schedule according to the original usage for the transformer
                learning_rate = CustomSchedule(
                    int(self.args.d_model / 2),
                    int(self.args.num_iterations /
                        10))  # --> same schedule with lower general lr
                optimizer = tf.keras.optimizers.Adam(learning_rate,
                                                     beta_1=0.9,
                                                     beta_2=0.98,
                                                     epsilon=1e-9)

            elif custom_lr_schedule == "Linear":
                lrs = LinearCustomSchedule(learning_rate,
                                           self.args.num_iterations)
                optimizer = tf.keras.optimizers.Adam(lrs,
                                                     beta_1=0.9,
                                                     beta_2=0.98,
                                                     epsilon=1e-9)

            else:
                optimizer = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=learning_rate)

            if loss_function == "element_wise_huber_loss":
                lf = element_wise_huber_loss
            elif loss_function == "element_wise_squared_loss":
                lf = element_wise_squared_loss

            if doubleQ == False:  # global step count
                agent = dqn_agent.DqnAgent(
                    tf_env.time_step_spec(),
                    tf_env.action_spec(),
                    q_network=q_net,
                    epsilon_greedy=epsilon_greedy,
                    target_update_tau=target_update_tau,
                    target_update_period=target_update_period,
                    optimizer=optimizer,
                    gamma=gamma,
                    td_errors_loss_fn=lf,
                    reward_scale_factor=self.args.reward_scale_factor,
                    gradient_clipping=gradient_clipping,
                    debug_summaries=self.args.debug_summaries,
                    summarize_grads_and_vars=self.args.
                    summarize_grads_and_vars,
                    train_step_counter=global_step)
            else:
                agent = dqn_agent.DdqnAgent(
                    tf_env.time_step_spec(),
                    tf_env.action_spec(),
                    q_network=q_net,
                    epsilon_greedy=epsilon_greedy,
                    target_update_tau=target_update_tau,
                    target_update_period=target_update_period,
                    optimizer=optimizer,
                    gamma=gamma,
                    td_errors_loss_fn=lf,
                    reward_scale_factor=self.args.reward_scale_factor,
                    gradient_clipping=gradient_clipping,
                    debug_summaries=self.args.debug_summaries,
                    summarize_grads_and_vars=self.args.
                    summarize_grads_and_vars,
                    train_step_counter=global_step)
            agent.initialize()

            metric = train_eval_2(
                root_dir=os.path.join(self.args.output_dir,
                                      str(self.trial) + "_" + str(x)),
                num_eval_episodes=self.args.num_eval_episodes,
                tf_env=tf_env,
                eval_tf_env=eval_tf_env,
                agent=agent,
                eval_interval=self.args.eval_interval,
                summary_interval=self.args.summary_interval,
                num_iterations=self.args.num_iterations,
                initial_collect_steps=initial_collect_steps,
                collect_steps_per_iteration=self.args.
                collect_steps_per_iteration,
                replay_buffer_capacity=replay_buffer_max_length,
                train_steps_per_iteration=self.args.train_steps_per_iteration,
                batch_size=batch_size,
                use_tf_functions=self.args.run_graph_mode,
                log_interval=self.args.log_interval,
                global_step=global_step)

            outs.append(metric)

        return -np.mean(
            outs
        )  # since we are minimizing we need to take the negative reward sum
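`CustomSchedule` is not defined in the snippet; the comments point to the warmup schedule from the original Transformer paper, which under that assumption could be sketched as:

import tensorflow as tf

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Assumed Transformer schedule: lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)."""

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)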
Example #9
    def __init__(self):
        self._train_py_env = suite_gym.load(T48GymEnv.GYM_ENV_NAME, max_episode_steps=T48GymTensorflowContext.max_episode_steps)
        self._eval_py_env = suite_gym.load(T48GymEnv.GYM_ENV_NAME, max_episode_steps=T48GymTensorflowContext.max_episode_steps)
        self._train_env = tf_py_environment.TFPyEnvironment(self._train_py_env)
        self._eval_env = tf_py_environment.TFPyEnvironment(self._eval_py_env)

        self._global_step = tf.compat.v1.train.get_or_create_global_step()

        self._q_net = q_network.QNetwork(
          self._train_env.observation_spec(),
          self._train_env.action_spec(),
          fc_layer_params=(100,))
        self._agent = dqn_agent.DdqnAgent(
            self._train_env.time_step_spec(),
            self._train_env.action_spec(),
            q_network=self._q_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=T48GymTensorflowContext.learning_rate),
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=self._global_step,
            epsilon_greedy=0.0)
        self._agent.initialize()
        self._agent.train = common.function(self._agent.train)

        self._replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=self._agent.collect_data_spec,
            batch_size=self._train_env.batch_size,
            max_length=T48GymTensorflowContext.replay_buffer_max_length)
        self._dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=self._train_env.batch_size,
            num_steps=2).prefetch(3)

        self._iterator = iter(self._dataset)

        self._RANDOM_POLICY = random_tf_policy.RandomTFPolicy(self._train_env.time_step_spec(),
                                                              self._train_env.action_spec())
        self._collect_policy = self._agent.collect_policy
        self._eval_policy = self._agent.policy

        self._collect_driver = dynamic_step_driver.DynamicStepDriver(
            self._train_env,
            self._collect_policy,
            observers=[self._replay_buffer.add_batch] + T48GymTensorflowContext.train_metrics,
            num_steps=2)

        self._train_checkpointer = common.Checkpointer(
            ckpt_dir=T48GymTensorflowContext.train_dir,
            global_step=self._global_step,
            agent=self._agent,
            metrics=metric_utils.MetricsGroup(T48GymTensorflowContext.train_metrics, 'train_metrics'))
        self._policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(T48GymTensorflowContext.train_dir, 'policy'),
            global_step=self._global_step,
            policy=self._eval_policy)
        self._rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(T48GymTensorflowContext.train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=self._replay_buffer)

        self._tf_policy_saver = policy_saver.PolicySaver(self._agent.policy)

        self._train_checkpointer.initialize_or_restore()
        self._policy_checkpointer.initialize_or_restore()
        self._rb_checkpointer.initialize_or_restore()
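A training step for this context would then typically run the collect driver and feed one batch from the dataset iterator to the agent; a sketch of such a method (not part of the original class), assuming the replay buffer already contains enough data to sample from:

    def train_step(self):
        """One collect-and-train iteration (sketch)."""
        self._collect_driver.run()                 # collect num_steps transitions into the buffer
        experience, _ = next(self._iterator)       # sample a batch of 2-step trajectories
        loss_info = self._agent.train(experience)  # single gradient update
        return loss_info.loss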
Example #10
    def __call__(self, trial):
	    outs = []
	    self.trial+= 1 
	    # parameters
	    doubleQ = trial.suggest_categorical('doubleQ', [True, False]) 
	    custom_layer_init = trial.suggest_categorical('custom_layer_init', [None]) 
	    custom_last_layer = trial.suggest_categorical('custom_last_layer', [True,False]) 
	    initial_collect_steps = trial.suggest_categorical('initial_collect_steps', [100]) 
	    loss_function = trial.suggest_categorical('loss_function', ["element_wise_huber_loss","element_wise_squared_loss"]) 
	    num_heads = trial.suggest_categorical('num_heads', [2,4,8]) 
	    normalize_env = trial.suggest_categorical('normalize_env', [False]) # broken
	    custom_lr_schedule = trial.suggest_categorical('custom_lr_schedule', ["No"]) 
	    epsilon_greedy = trial.suggest_categorical('epsilon_greedy', [0.1, 0.2, 0.3])
	    target_update_period = trial.suggest_categorical('target_update_period', [5, 10, 15])
	    rate = trial.suggest_categorical('rate', [0.1,0.3])
	    gradient_clipping = trial.suggest_categorical('gradient_clipping', [True, False]) 
	    replay_buffer_max_length = trial.suggest_categorical('replay_buffer_max_length', [100000, 200000]) 
	    batch_size = trial.suggest_categorical('batch_size', [16,32,64,128]) 
	    learning_rate = trial.suggest_categorical('learning_rate', [1e-2,1e-3,1e-4,1e-5]) 
	    encoder_type = trial.suggest_categorical('encoder_type', [2,3]) 
	    layer_type = trial.suggest_categorical('layer_type', [1,2,3,5,6,7]) 
	    target_update_tau = trial.suggest_categorical('target_update_tau', [0.05,0.1])
	    gamma = trial.suggest_categorical('gamma', [0.99,0.95])  

	    for x in range(self.args.n_trys):

		    global_step = tf.Variable(0, trainable=False,dtype="int64",name= "global_step")
		    baseEnv = gym.make(self.args.env)
		    env = suite_gym.load(self.args.env)
		    eval_env = suite_gym.load(self.args.env)
		    if normalize_env == True:
		        env = NormalizeWrapper(env,self.args.approx_env_boundaries,self.args.env)
		        eval_env = NormalizeWrapper(eval_env,self.args.approx_env_boundaries,self.args.env)
		    env = PyhistoryWrapper(env,self.args.max_horizon,self.args.atari)
		    eval_env = PyhistoryWrapper(eval_env,self.args.max_horizon,self.args.atari)
		    tf_env = tf_py_environment.TFPyEnvironment(env)
		    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)


		    q_net = QTransformer(
		        tf_env.observation_spec(),
		        baseEnv.action_space.n,
		        num_layers=self.args.num_layers,
		        d_model=self.args.d_model,
		        num_heads=num_heads, 
		        dff=self.args.dff,
		        rate = rate,
		        encoderType = encoder_type,
		        enc_layer_type=layer_type,
		        max_horizon=self.args.max_horizon,
		        custom_layer = custom_layer_init, 
		        custom_last_layer = custom_last_layer)

		    if custom_lr_schedule == "Transformer":    # builds a lr schedule according to the original usage for the transformer
		        learning_rate = CustomSchedule(self.args.d_model,int(self.args.num_iterations/10))
		        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

		    elif custom_lr_schedule == "Transformer_low":    # builds a lr schedule according to the original usage for the transformer
		        learning_rate = CustomSchedule(int(self.args.d_model/2),int(self.args.num_iterations/10)) # --> same schedule with lower general lr
		        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

		    elif custom_lr_schedule == "Linear": 
		        lrs = LinearCustomSchedule(learning_rate,self.args.num_iterations)
		        optimizer = tf.keras.optimizers.Adam(lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

		    else:
		        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

		    if loss_function == "element_wise_huber_loss" :
		    	lf = element_wise_huber_loss
		    elif loss_function == "element_wise_squared_loss":
		    	lf = element_wise_squared_loss

		    if doubleQ == False:          # global step count
		        agent = dqn_agent.DqnAgent(
		            tf_env.time_step_spec(),
		            tf_env.action_spec(),
		            q_network=q_net,
		            epsilon_greedy=epsilon_greedy,
		            target_update_tau=target_update_tau,
		            target_update_period=target_update_period,
		            optimizer=optimizer,
		            gamma=gamma,
		            td_errors_loss_fn = lf,
		            reward_scale_factor=self.args.reward_scale_factor,
		            gradient_clipping=gradient_clipping,
		            debug_summaries=self.args.debug_summaries,
		            summarize_grads_and_vars=self.args.summarize_grads_and_vars,
		            train_step_counter=global_step)
		    else:
		        agent = dqn_agent.DdqnAgent(
		            tf_env.time_step_spec(),
		            tf_env.action_spec(),
		            q_network=q_net,
		            epsilon_greedy=epsilon_greedy,
		            target_update_tau=target_update_tau,
		            target_update_period=target_update_period,
		            optimizer=optimizer,
		            gamma=gamma,
		            td_errors_loss_fn = lf,
		            reward_scale_factor=self.args.reward_scale_factor,
		            gradient_clipping=gradient_clipping,
		            debug_summaries=self.args.debug_summaries,
		            summarize_grads_and_vars=self.args.summarize_grads_and_vars,
		            train_step_counter=global_step)
		    agent.initialize()

		    tf.profiler.experimental.start('profiler')

		    metric = train_eval_2(
		    root_dir = os.path.join(self.args.output_dir, str(self.trial) + "_" + str(x)),
		    num_eval_episodes = self.args.num_eval_episodes,
		    tf_env= tf_env,
		    eval_tf_env = eval_tf_env,
		    agent = agent, 
		    eval_interval = self.args.eval_interval,
		    summary_interval = self.args.summary_interval,
		    num_iterations=self.args.num_iterations,
		    initial_collect_steps= initial_collect_steps,
		    collect_steps_per_iteration= self.args.collect_steps_per_iteration,
		    replay_buffer_capacity= replay_buffer_max_length,
		    train_steps_per_iteration=self.args.train_steps_per_iteration,
		    batch_size = batch_size,
		    use_tf_functions=self.args.run_graph_mode,
		    log_interval = self.args.log_interval,
		    global_step = global_step)

		    tf.profiler.experimental.stop()

		    return 0
Example #11
q_net = q_network.QNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params)



optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

tf_agent = dqn_agent.DdqnAgent(train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=0.01,
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        train_step_counter=train_step_counter)


print("ready to go")

random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
        train_env.action_spec())
print("policy")

def collect_step(environment, policy, random_policy, agent, max_t):
    """ """
    debug_mode = False
    environment.set_debug(False)
Example #12
def main(arg, pars):
    """
    
    
    """
    print("load env ..")
    env_name = ("Car-v0")
    #env = gym.make("Car-v0")
    env = suite_gym.load(env_name,
                         discount=arg.gamma,
                         max_episode_steps=arg.max_t)
    print_parameter(arg, pars)
    train_py_env = suite_gym.load(env_name,
                                  discount=arg.gamma,
                                  max_episode_steps=arg.max_t)
    eval_py_env = suite_gym.load(env_name,
                                 discount=arg.gamma,
                                 max_episode_steps=arg.max_t)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    print("env loaded")
    train_dir = os.path.join(arg.root_dir, 'network_weights')
    eval_dir = os.path.join(arg.root_dir, 'eval')

    train_env.reset()
    fc_layer_params = (arg.hidden_size_1, )
    q_net = q_network.QNetwork(train_env.observation_spec(),
                               train_env.action_spec(),
                               fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=arg.lr)
    train_step_counter = tf.compat.v2.Variable(0)

    tf_agent = dqn_agent.DdqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=arg.eps_start,
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        train_step_counter=train_step_counter)

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()

    train_checkpointer = common.Checkpointer(ckpt_dir=train_dir,
                                             agent=tf_agent,
                                             global_step=global_step,
                                             metrics=metric_utils.MetricsGroup(
                                                 train_metrics,
                                                 'train_metrics'))

    if arg.continue_training == False:
        tf_agent.initialize()
        if os.path.exists("network_weights/*"):
            os.remove("network_weights/*")
    else:
        print("Continue Training")
        train_checkpointer.initialize_or_restore()
    print("ready to go")
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=arg.buffer_size)
    cv2.namedWindow("display", cv2.WINDOW_NORMAL)
    collect_data(train_env,
                 random_policy,
                 replay_buffer,
                 tf_agent,
                 steps=arg.learn_start,
                 max_t=40)
    print("create dataset")
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=arg.batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # (Optional) Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)
    # Reset the train step
    tf_agent.train_step_counter.assign(0)
    avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                    arg.num_eval_episodes)
    returns = [avg_return]
    returns_average = [avg_return]
    train_loss_average = [1]
    score = 0
    scores_window = deque(maxlen=100)  # last 100 scores
    total_train_loss = deque(maxlen=100)  # last 100 training losses

    train(arg, tf_agent, train_env, eval_env, replay_buffer, iterator,
          train_checkpointer)
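`compute_avg_return` is not included in the snippet; the usual helper (the same logic as `avg_return` in Example #6) would be:

def compute_avg_return(environment, policy, num_episodes=10):
    """Average undiscounted return of `policy` over num_episodes episodes."""
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return (total_return / num_episodes).numpy()[0]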
Example #13
def train_level(level,
                consecutive_wins_flag=5,
                collect_random_steps=True,
                max_iterations=num_iterations):
    """
    create DQN agent to train a level of the game
    :param level: level of the game
    :param consecutive_wins_flag: number of consecutive wins in evaluation
    signifying the training is done
    :param collect_random_steps: whether to collect random steps at the beginning,
    always set to 'True' when the global step is 0.
    :param max_iterations: stop the training when it reaches the max iteration
    regardless of the result
    """
    global saving_time
    cells = query_level(level)
    size = len(cells)
    env = tf_py_environment.TFPyEnvironment(GameEnv(size, cells))
    eval_env = tf_py_environment.TFPyEnvironment(GameEnv(size, cells))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    fc_layer_params = (neuron_num_mapper[size], )

    q_net = q_network.QNetwork(env.observation_spec()[0],
                               env.action_spec(),
                               fc_layer_params=fc_layer_params,
                               activation_fn=tf.keras.activations.relu)

    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = dqn_agent.DdqnAgent(
        env.time_step_spec(),
        env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=global_step,
        observation_and_action_constraint_splitter=GameEnv.
        obs_and_mask_splitter)
    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=replay_buffer_max_length)

    # drivers
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        env,
        policy=agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)

    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]

    eval_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        eval_env,
        policy=agent.policy,
        observers=eval_metrics,
        num_episodes=num_eval_episodes)

    # checkpointer of the replay buffer and policy
    train_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        dir_path, 'trained_policies/train_lv{0}'.format(level)),
                                             max_to_keep=1,
                                             agent=agent,
                                             policy=agent.policy,
                                             global_step=global_step,
                                             replay_buffer=replay_buffer)

    # policy saver
    tf_policy_saver = policy_saver.PolicySaver(agent.policy)

    train_checkpointer.initialize_or_restore()

    # optimize by wrapping some of the code in a graph using TF function
    agent.train = common.function(agent.train)
    collect_driver.run = common.function(collect_driver.run)
    eval_driver.run = common.function(eval_driver.run)

    # collect initial replay data
    if collect_random_steps:
        initial_collect_policy = random_tf_policy.RandomTFPolicy(
            time_step_spec=env.time_step_spec(),
            action_spec=env.action_spec(),
            observation_and_action_constraint_splitter=GameEnv.
            obs_and_mask_splitter)

        dynamic_step_driver.DynamicStepDriver(
            env,
            initial_collect_policy,
            observers=[replay_buffer.add_batch],
            num_steps=initial_collect_steps).run()

    # Dataset generates trajectories with shape [Bx2x...]
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # train until consecutive_wins_flag consecutive evaluations achieve an average return above 10
    consecutive_eval_win = 0
    train_iterations = 0
    while consecutive_eval_win < consecutive_wins_flag and train_iterations < max_iterations:
        collect_driver.run()

        for _ in range(collect_steps_per_iteration):
            experience, _ = next(iterator)
            train_loss = agent.train(experience).loss

        # evaluate the training at intervals
        step = global_step.numpy()
        if step % eval_interval == 0:
            eval_driver.run()
            average_return = eval_metrics[0].result().numpy()
            average_len = eval_metrics[1].result().numpy()
            print("level: {0} step: {1} AverageReturn: {2} AverageLen: {3}".
                  format(level, step, average_return, average_len))

            # evaluate consecutive wins
            if average_return > 10:
                consecutive_eval_win += 1
            else:
                consecutive_eval_win = 0

        if step % save_interval == 0:
            start = time.time()
            train_checkpointer.save(global_step=step)
            saving_time += time.time() - start

        train_iterations += 1

    # save the policy
    train_checkpointer.save(global_step=global_step.numpy())
    tf_policy_saver.save(
        os.path.join(dir_path, 'trained_policies/policy_lv{0}'.format(level)))
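A policy exported by `PolicySaver` can later be restored without rebuilding the agent; a brief sketch, assuming `dir_path`, `level` and an evaluation environment `eval_env` like the ones used inside `train_level`:

import os
import tensorflow as tf

saved_policy = tf.saved_model.load(
    os.path.join(dir_path, 'trained_policies/policy_lv{0}'.format(level)))
time_step = eval_env.reset()
while not time_step.is_last():
    action_step = saved_policy.action(time_step)
    time_step = eval_env.step(action_step.action)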
Example #14
def create_agent(
        agent_class,
        environment,
        fc_layer_params,
        learning_rate,
        decaying_epsilon,
        n_step_update,
        target_update_tau,
        target_update_period,
        gamma,
        reward_scale_factor,
        gradient_clipping,
        debug_summaries,
        summarize_grads_and_vars,
        train_step_counter,
        num_atoms=None,  # Only for categorical_dqn
        min_q_value=None,  # Only for categorical_dqn
        max_q_value=None,  # Only for categorical_dqn
):
    """Creates the Hanabi agent.

	Args:
	  agent_class: str, type of agent to construct.
	  environment: The environment.
	  learning_rate: The Learning Rate
	  decaying_epsilon: Epsilon for Epsilon Greedy Policy
	  target_update_tau: Agent parameter
	  target_update_period: Agent parameter
	  gamma: Agent parameter
	  reward_scale_factor: Agent parameter
	  gradient_clipping: Agent parameter
	  debug_summaries: Agent parameter
	  summarize_grads_and_vars: Agent parameter
	  train_step_counter: The train step tf.Variable to be passed to agent


	Returns:
	  An agent for playing Hanabi.

	Raises:
	  ValueError: if an unknown agent type is requested.
	"""
    if agent_class == 'DQN':
        return dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'DDQN':
        return dqn_agent.DdqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'categorical_dqn':
        return categorical_dqn_agent.CategoricalDqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            categorical_q_network=categorical_q_network.CategoricalQNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                num_atoms=num_atoms,
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            min_q_value=min_q_value,
            max_q_value=max_q_value,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    else:
        raise ValueError(
            'Expected a valid agent_class, got {}'.format(agent_class))
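`observation_and_action_constraint_splitter` is referenced above but not defined; in TF-Agents it must return `(observation, action_mask)`. Assuming the environment's observation dict carries the mask under a key such as 'legal_moves' (the key name is an assumption), a sketch would be:

def observation_and_action_constraint_splitter(observation):
    """Hypothetical splitter: return (network input, legal-action mask)."""
    return observation['observations'], observation['legal_moves']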
Example #15
def main():
    parser = argparse.ArgumentParser()

    ## Essential parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model stats and checkpoints will be written."
    )
    parser.add_argument("--env",
                        default=None,
                        type=str,
                        required=True,
                        help="The environment to train the agent on")
    parser.add_argument("--approx_env_boundaries",
                        default=False,
                        type=bool,
                        help="Whether to get the env boundaries approximately")
    parser.add_argument("--max_horizon", default=4, type=int)
    parser.add_argument("--atari",
                        default=True,
                        type=bool,
                        help="Gets some data Types correctly")

    ##agent parameters
    parser.add_argument("--reward_scale_factor", default=1.0, type=float)
    parser.add_argument("--debug_summaries", default=False, type=bool)
    parser.add_argument("--summarize_grads_and_vars", default=False, type=bool)

    ##transformer parameters
    parser.add_argument("--d_model", default=64, type=int)
    parser.add_argument("--num_layers", default=2, type=int)
    parser.add_argument("--dff", default=256, type=int)

    ##Training parameters
    parser.add_argument('--num_iterations',
                        type=int,
                        default=2000000,
                        help="steps in the env")
    parser.add_argument('--num_iparallel',
                        type=int,
                        default=1,
                        help="how many envs should run in parallel")
    parser.add_argument("--collect_steps_per_iteration", default=4, type=int)
    parser.add_argument("--train_steps_per_iteration", default=1, type=int)

    ## Other parameters
    parser.add_argument("--num_eval_episodes", default=10, type=int)
    parser.add_argument("--eval_interval", default=10000, type=int)
    parser.add_argument("--log_interval", default=10000, type=int)
    parser.add_argument("--summary_interval", default=10000, type=int)
    parser.add_argument("--run_graph_mode", default=True, type=bool)
    parser.add_argument("--checkpoint_interval", default=100000, type=int)
    parser.add_argument("--summary_flush", default=10,
                        type=int)  #what does this exactly do?

    # HP opt params
    parser.add_argument("--doubleQ",
                        default=True,
                        type=bool,
                        help="Whether to use a  DoubleQ agent")
    parser.add_argument("--custom_last_layer", default=True, type=bool)
    parser.add_argument("--custom_layer_init", default=1.0, type=float)
    parser.add_argument("--initial_collect_steps", default=50000, type=int)
    parser.add_argument("--loss_function",
                        default="element_wise_huber_loss",
                        type=str)
    parser.add_argument("--num_heads", default=4, type=int)
    parser.add_argument("--normalize_env", default=False, type=bool)
    parser.add_argument('--custom_lr_schedule',
                        default="No",
                        type=str,
                        help="whether to use a custom LR schedule")
    parser.add_argument("--epsilon_greedy", default=0.3, type=float)
    parser.add_argument("--target_update_period", default=10000, type=int)
    parser.add_argument(
        "--rate", default=0.1, type=float
    )  # dropout rate (may be unused depending on the Q network); setting it to 0.0 breaks the code, so select a network without dropout instead
    parser.add_argument("--gradient_clipping", default=True, type=bool)
    parser.add_argument("--replay_buffer_max_length",
                        default=1000000,
                        type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument("--encoder_type",
                        default=3,
                        type=int,
                        help="Which Type of encoder is used for the model")
    parser.add_argument("--layer_type",
                        default=3,
                        type=int,
                        help="Which Type of layer is used for the encoder")
    parser.add_argument("--target_update_tau", default=1, type=float)
    parser.add_argument("--gamma", default=0.99, type=float)

    args = parser.parse_args()
    # List of encoder modules which we can use to change encoder based on a variable
    global_step = tf.compat.v1.train.get_or_create_global_step()

    baseEnv = gym.make(args.env)
    env = suite_gym.load(args.env, gym_kwargs={"frameskip": 4})
    eval_env = suite_gym.load(args.env, gym_kwargs={"frameskip": 4})
    if args.normalize_env == True:
        env = NormalizeWrapper(env, args.approx_env_boundaries, args.env)
        eval_env = NormalizeWrapper(eval_env, args.approx_env_boundaries,
                                    args.env)
    env = PyhistoryWrapper(env, args.max_horizon, args.atari)
    eval_env = PyhistoryWrapper(eval_env, args.max_horizon, args.atari)
    tf_env = tf_py_environment.TFPyEnvironment(env)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)

    q_net = QTransformer(tf_env.observation_spec(),
                         baseEnv.action_space.n,
                         num_layers=args.num_layers,
                         d_model=args.d_model,
                         num_heads=args.num_heads,
                         dff=args.dff,
                         rate=args.rate,
                         encoderType=args.encoder_type,
                         enc_layer_type=args.layer_type,
                         max_horizon=args.max_horizon,
                         custom_layer=args.custom_layer_init,
                         custom_last_layer=args.custom_last_layer)

    if args.custom_lr_schedule == "Transformer":  # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(args.d_model,
                                       int(args.num_iterations / 10))
        optimizer = tf.keras.optimizers.Adam(learning_rate,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)

    elif args.custom_lr_schedule == "Transformer_low":  # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(
            int(args.d_model / 2),
            int(args.num_iterations /
                10))  # --> same schedule with lower general lr
        optimizer = tf.keras.optimizers.Adam(learning_rate,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)

    elif args.custom_lr_schedule == "Linear":
        lrs = LinearCustomSchedule(learning_rate, args.num_iterations)
        optimizer = tf.keras.optimizers.Adam(lrs,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)

    else:
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=args.learning_rate)

    if args.loss_function == "element_wise_huber_loss":
        lf = element_wise_huber_loss
    elif args.loss_function == "element_wise_squared_loss":
        lf = element_wise_squared_loss

    if args.doubleQ == False:  # global step count
        agent = dqn_agent.DqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=args.epsilon_greedy,
            target_update_tau=args.target_update_tau,
            target_update_period=args.target_update_period,
            td_errors_loss_fn=lf,
            optimizer=optimizer,
            gamma=args.gamma,
            reward_scale_factor=args.reward_scale_factor,
            gradient_clipping=args.gradient_clipping,
            debug_summaries=args.debug_summaries,
            summarize_grads_and_vars=args.summarize_grads_and_vars,
            train_step_counter=global_step)
    else:
        agent = dqn_agent.DdqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=args.epsilon_greedy,
            target_update_tau=args.target_update_tau,
            td_errors_loss_fn=lf,
            target_update_period=args.target_update_period,
            optimizer=optimizer,
            gamma=args.gamma,
            reward_scale_factor=args.reward_scale_factor,
            gradient_clipping=args.gradient_clipping,
            debug_summaries=args.debug_summaries,
            summarize_grads_and_vars=args.summarize_grads_and_vars,
            train_step_counter=global_step)
    agent.initialize()

    count_weights(q_net)

    train_eval(root_dir=args.output_dir,
               tf_env=tf_env,
               eval_tf_env=eval_tf_env,
               agent=agent,
               num_iterations=args.num_iterations,
               initial_collect_steps=args.initial_collect_steps,
               collect_steps_per_iteration=args.collect_steps_per_iteration,
               replay_buffer_capacity=args.replay_buffer_max_length,
               train_steps_per_iteration=args.train_steps_per_iteration,
               batch_size=args.batch_size,
               use_tf_functions=args.run_graph_mode,
               num_eval_episodes=args.num_eval_episodes,
               eval_interval=args.eval_interval,
               train_checkpoint_interval=args.checkpoint_interval,
               policy_checkpoint_interval=args.checkpoint_interval,
               rb_checkpoint_interval=args.checkpoint_interval,
               log_interval=args.log_interval,
               summary_interval=args.summary_interval,
               summaries_flush_secs=args.summary_flush)

    pickle.dump(args, open(args.output_dir + "/training_args.p", "wb"))
    print("Successfully trained and evaluation.")
Beispiel #16
0
def main():
    retro.data.Integrations.add_custom_path(
            os.path.join(SCRIPT_DIR, "custom_integrations")
    )
    print("PokemonRed-GameBoy" in retro.data.list_games(inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy", inttype=retro.data.Integrations.ALL)
    print(env)
    # Wrap the Gym Retro environment so it exposes the TF-Agents spec/time-step
    # API (observation_spec, action_spec, time_step_spec) used below; assumes
    # `from tf_agents.environments import gym_wrapper, tf_py_environment` at the
    # top of the script. DdqnAgent additionally expects a scalar discrete action
    # spec, so retro's default MultiBinary action space may need a discretizing
    # wrapper on top of this.
    env = tf_py_environment.TFPyEnvironment(gym_wrapper.GymWrapper(env))

    obs = env.reset()
    
    #get start time
    start_time = time.time()

    network = q_network.QNetwork(env.observation_spec(),
                                 env.action_spec(),
                                 fc_layer_params=(100,))

    # initialize the agent with the listed parameters
    agent = dqn_agent.DdqnAgent(env.time_step_spec(),
                                env.action_spec(),
                                q_network=network,
                                optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                                td_errors_loss_fn=common.element_wise_squared_loss,
                                train_step_counter=tf.Variable(0),
                                epsilon_greedy=epsilon,
                                gamma=gamma,
                                name=name)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=replay_buffer_capacity)

    
    # add an observer that writes collected experience into the buffer
    replay_observer = [replay_buffer.add_batch]

    #create step driver
    #collect_op = dynamic_step_driver.DynamicStepDriver(env, agent.collect_policy, observers  = replay_observer, num_steps = 10).run()

    #create random policy to help generate dataset
    random_policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(), env.action_spec())

    #populate replay buffer
    collect_data(env, random_policy, replay_buffer, initial_collect_steps) 
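
    # Sketch (assumption; collect_data is defined elsewhere in this script): the
    # usual TF-Agents helper steps the TF environment with the given policy and
    # writes each transition into the replay buffer, e.g. (requires
    # `from tf_agents.trajectories import trajectory`):
    def _collect_data_sketch(environment, policy, buffer, steps):
        for _ in range(steps):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            buffer.add_batch(trajectory.from_transition(time_step, action_step,
                                                        next_time_step))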

    # sample trajectories with num_steps=2 so each dataset element pairs the current and next time step
    dataset = replay_buffer.as_dataset(num_parallel_calls = 3,
                                       sample_batch_size = batch_size,
                                       num_steps = 2).prefetch(3)

    #create iterator for dataset to feed the agent
    iterator = iter(dataset)
    #print(iterator)

    #wrap in a graph for TF optimization
    agent.train = common.function(agent.train)

    agent.train_step_counter.assign(0) #reset

    # evaluate the freshly initialized policy once to establish a baseline
    avg = avg_return(env, agent.policy, num_eval_episodes)
    returns = [avg] #holds average returns from multiple points during training
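
    # Sketch (assumption; avg_return is defined elsewhere in this script): the
    # standard TF-Agents evaluation helper averages the undiscounted return of
    # the given policy over a number of episodes, along the lines of:
    def _avg_return_sketch(environment, policy, num_episodes=10):
        total_return = 0.0
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0.0
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return
        return total_return / num_episodes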
        
    #main training loop
    for i in range(num_iterations):
        env.render()
        collect_data(env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

        #sample and update network
        exp, _ = next(iterator)
        loss = agent.train(exp).loss

        #get step
        step = agent.train_step_counter.numpy()

        #log progress or evaluate policy if needed (depending on hyperparameters)
        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, loss))

        if step % eval_interval == 0:
            avg = avg_return(env, agent.policy, num_eval_episodes)
            print('step = {0}: Average return = {1}'.format(step, avg))
            returns.append(avg)


    #produce graph of training process
    iterations = range(0, num_iterations+1, eval_interval)
    plt.plot(iterations, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Iteration')
    plt.title('Average Return Over Time')
    plt.show()
Beispiel #17
0
optimizer = tf.keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                        rho=0.95,
                                        momentum=0.0,
                                        epsilon=0.00001,
                                        centered=True)

epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=250000 // update_period,
    end_learning_rate=0.01)
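
# Illustrative check (not part of the original snippet): the schedule anneals
# epsilon linearly from 1.0 down to 0.01 over 250000 // update_period training
# steps; the agent below re-reads it on every step via the lambda passed to
# epsilon_greedy.
print(float(epsilon_fn(0)))                        # 1.0 at the start
print(float(epsilon_fn(250000 // update_period)))  # 0.01 once fully decayed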

agent = dqn_agent.DdqnAgent(
    train_tf_env.time_step_spec(),
    train_tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,
    td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"),
    gamma=0.99,
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))

agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_tf_env.batch_size,
    max_length=1000000)

replay_buffer_observer = replay_buffer.add_batch

train_metrics = [
Beispiel #18
0
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate,
                                             epsilon=adam_epsilon)

train_step_counter = tf.Variable(0)

epsilon = tf.compat.v1.train.polynomial_decay(start_epsilon,
                                              train_step_counter,
                                              num_iterations,
                                              end_learning_rate=end_epsilon)
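
# In TF2 eager mode, tf.compat.v1.train.polynomial_decay returns a no-argument
# callable that re-evaluates the decay against train_step_counter, so it can be
# passed directly to epsilon_greedy below and will anneal as training proceeds.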

agent = dqn_agent.DdqnAgent(train_game_env.time_step_spec(),
                            train_game_env.action_spec(),
                            q_network=q_net,
                            epsilon_greedy=epsilon,
                            optimizer=optimizer,
                            target_update_period=target_update_period,
                            td_errors_loss_fn=common.element_wise_squared_loss,
                            train_step_counter=train_step_counter)

agent.initialize()

# DEFINE METRICS ETC. (see tf-agents DQN tutorial)


def compute_avg_return(environment, policy, num_episodes=10):

    total_return = 0.0
    for i in range(num_episodes):

        time_step = environment.reset()