def _agent(self, cfg):
    """Return a TF-Agents DqnAgent or DdqnAgent built from the config."""
    net = self._net(cfg["network"])
    optimizer = cfg["optimizer"]["optimizer"](
        learning_rate=cfg["optimizer"]["learning_rate"])
    loss_fn = cfg["optimizer"]["loss_fn"]
    if cfg["type"].lower() == "dqn":
        return dqn_agent.DqnAgent(
            self.env.time_step_spec(),
            self.env.action_spec(),
            q_network=net,
            optimizer=optimizer,
            td_errors_loss_fn=loss_fn,
            target_update_tau=cfg["target"]["soft"],
            target_update_period=cfg["target"]["period"],
            gamma=cfg["gamma"],
            reward_scale_factor=1.0,
            train_step_counter=tf.Variable(0))
    elif cfg["type"].lower() == "ddqn":
        return dqn_agent.DdqnAgent(
            self.env.time_step_spec(),
            self.env.action_spec(),
            q_network=net,
            optimizer=optimizer,
            td_errors_loss_fn=loss_fn,
            target_update_tau=cfg["target"]["soft"],
            target_update_period=cfg["target"]["period"],
            gamma=cfg["gamma"],
            reward_scale_factor=1.0,
            train_step_counter=tf.Variable(0))
    else:
        raise ValueError("Unknown type of agent! Input type: {}".format(cfg["type"]))
def init_agent(self):
    """A DQN agent is set by default in the application."""
    # get the global step
    global_step = tf.compat.v1.train.get_or_create_global_step()
    # TODO: update this to get the optimizer from tensorflow 2.0 if possible
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    q_net = q_network.QNetwork(self._rl_app.observation_spec,
                               self._rl_app.action_spec,
                               fc_layer_params=fc_layer_params)
    time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
    tf_agent = dqn_agent.DdqnAgent(
        time_step_spec,
        self._rl_app.action_spec,
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=eps_final,
        gradient_clipping=gradient_clipping,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=global_step,
        debug_summaries=True,
        summarize_grads_and_vars=True)
    tf_agent.initialize()
    logger.info("tf_agent initialization is complete")
    # Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)
    return tf_agent
def create_agent(observation_spec, action_spec, time_step_spec, step_counter,
                 use_double_q=False):
    """Creates a DQN/DQRNN agent."""
    train_sequence_length = FLAGS.train_sequence_length
    if train_sequence_length > 1:
        q_net = q_rnn_network.RnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=parse_str_flag(FLAGS.input_fc_layers),
            cell_type=FLAGS.network_type,
            hidden_size=parse_str_flag(FLAGS.hidden_sizes),
            output_fc_layer_params=parse_str_flag(FLAGS.output_fc_layers))
    else:
        q_net = q_network.QNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=parse_str_flag(FLAGS.fc_layers))
        train_sequence_length = FLAGS.n_step_update

    if use_double_q:
        tf_agent = dqn_agent.DdqnAgent(
            time_step_spec,
            action_spec,
            q_network=q_net,
            epsilon_greedy=FLAGS.epsilon_greedy,
            n_step_update=FLAGS.n_step_update,
            boltzmann_temperature=FLAGS.boltzmann_temperature,
            target_update_tau=FLAGS.target_update_tau,
            target_update_period=FLAGS.target_update_period,
            optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=FLAGS.gamma,
            gradient_clipping=FLAGS.gradient_clipping,
            debug_summaries=FLAGS.debug_summaries,
            summarize_grads_and_vars=FLAGS.summarize_grads_and_vars,
            train_step_counter=step_counter)
    else:
        tf_agent = dqn_agent.DqnAgent(
            time_step_spec,
            action_spec,
            q_network=q_net,
            epsilon_greedy=FLAGS.epsilon_greedy,
            n_step_update=FLAGS.n_step_update,
            boltzmann_temperature=FLAGS.boltzmann_temperature,
            target_update_tau=FLAGS.target_update_tau,
            target_update_period=FLAGS.target_update_period,
            optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate),
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=FLAGS.gamma,
            gradient_clipping=FLAGS.gradient_clipping,
            debug_summaries=FLAGS.debug_summaries,
            summarize_grads_and_vars=FLAGS.summarize_grads_and_vars,
            train_step_counter=step_counter)
    return tf_agent, train_sequence_length
def create_tf_ddqn_agent(self, q_network, alpha, gamma, epsilon, init_temp, cooldown_time):
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=alpha)
    train_step_counter = tf.Variable(0)
    # Boltzmann temperature that cools linearly from init_temp towards 1 over
    # STEP_ITERATIONS * cooldown_time train steps.
    _temp = tf.Variable(float(init_temp)) - tf.dtypes.cast(
        train_step_counter, tf.float32) * tf.Variable(
            (init_temp - 1) / (STEP_ITERATIONS * cooldown_time))
    if epsilon is not None:
        # epsilon-greedy and Boltzmann exploration are mutually exclusive
        _temp = None
    return dqn_agent.DdqnAgent(self.tf_training_env.time_step_spec(),
                               self.tf_training_env.action_spec(),
                               q_network=q_network,
                               optimizer=optimizer,
                               gamma=gamma,
                               epsilon_greedy=epsilon,
                               boltzmann_temperature=_temp,
                               td_errors_loss_fn=element_wise_huber_loss,
                               train_step_counter=train_step_counter,
                               gradient_clipping=10.0)
def create_agent(self):
    # a deep neural network to learn Q(s, a)
    q_net = q_network.QNetwork(
        self._train_env.observation_spec(),
        self._train_env.action_spec(),
        # conv_layer_params=param.QNET_CONV_LAYERS,
        fc_layer_params=param.QNET_FC_LAYERS)
    # optional counter that increments every time the train op is run
    self._train_step = tf.Variable(0)
    # an adaptive learning rate for gradient descent
    optimizer = tf.keras.optimizers.Adam(learning_rate=param.ADAM_LR,
                                         epsilon=param.ADAM_EPSILON)
    # probability of exploration as a function of time steps
    epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=param.DECAY_LR_INIT,
        decay_steps=param.DECAY_STEPS // param.DECAY_UPDATE_PERIOD,
        end_learning_rate=param.DECAY_LR_END)
    # create the double deep Q-learning network agent
    self._agent = dqn_agent.DdqnAgent(
        self._train_env.time_step_spec(),
        self._train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        # period for soft update of the target networks
        target_update_period=param.AGENT_UPDATE_PERIOD,
        # loss function for gradient descent
        td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"),
        # a discount factor for future rewards
        gamma=param.AGENT_GAMMA,
        # counter that increments every time the train op is run
        train_step_counter=self._train_step,
        epsilon_greedy=lambda: epsilon_fn(self._train_step))
    self._agent.initialize()
class TrainAndSaveModel(network.Network):
    #
    # HYPERPARAMETERS
    #
    num_iterations = 200000  # number of batches in an epoch (a single pass through a dataset)
    #
    initial_collect_steps = 1500
    collect_steps_per_iteration = 1
    replay_buffer_capacity = 250000
    #
    batch_size = 100  # number of training examples before updating the model
    learning_rate = 0.000075  # a measure of how resistant a model is to change (important)
    log_interval = 500  # for printing progress during training
    #
    num_eval_episodes = 15
    eval_interval = 1000  # for deciding when to add a data point of progress
    #
    epsilon = 0.07  # probability of choosing a random action to avoid over/under fitting of the model
    gamma = 1.0  # discount factor for future rewards
    name = "BlackjackSavant"
    # END OF HYPERPARAMETERS

    #
    # HELPER METHODS
    #
    # records the data that results from executing the specified policy in the environment into the buffer
    # def collect_step(env, policy, buffer):

    # record data over the specified number of steps
    def collect_data(env, policy, buffer, steps):
        for _ in range(steps):
            time_step = env.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = env.step(action_step.action)
            traject = trajectory.from_transition(time_step, action_step, next_time_step)
            buffer.add_batch(traject)

    # average the reward gained by the policy
    def avg_return(env, policy, num_episodes=10):
        total_return = 0.0
        for _ in range(num_episodes):
            time_step = env.reset()
            episode_return = 0.0
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = env.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return
        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]
    # END OF HELPER METHODS

    #
    # MAIN EXECUTION
    #
    # get start time
    start_time = time.time()

    # initialize environment and wrap it in a TF environment
    env = benv.BlackjackEnv()
    tf_env = tf_py_environment.TFPyEnvironment(env)

    # initialize the Q-network
    network = q_network.QNetwork(tf_env.observation_spec(),
                                 tf_env.action_spec(),
                                 fc_layer_params=(100,))

    # initialize the agent with the listed parameters
    agent = dqn_agent.DdqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=network,
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=tf.Variable(0),
        epsilon_greedy=epsilon,
        gamma=gamma,
        name=name)
    agent.initialize()

    # create replay buffer to keep track of training
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # add an observer to add to the buffer
    replay_observer = [replay_buffer.add_batch]

    # create step driver
    # collect_op = dynamic_step_driver.DynamicStepDriver(tf_env, agent.collect_policy, observers=replay_observer, num_steps=10).run()

    # create random policy to help generate the dataset
    random_policy = random_tf_policy.RandomTFPolicy(tf_env.time_step_spec(),
                                                    tf_env.action_spec())

    # populate replay buffer
    collect_data(tf_env, random_policy, replay_buffer, initial_collect_steps)

    # generate trajectories; num_steps=2 so that it views current and next observation in the dataset
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)

    # create iterator for the dataset to feed the agent
    iterator = iter(dataset)
    # print(iterator)

    # wrap in a graph for TF optimization
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)  # reset

    # evaluate the initialized policy prior to training for a baseline
    avg = avg_return(tf_env, agent.policy, num_eval_episodes)
    returns = [avg]  # holds average returns from multiple points during training

    # main training loop
    for i in range(num_iterations):
        collect_data(tf_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

        # sample and update network
        exp, _ = next(iterator)
        loss = agent.train(exp).loss

        # get step
        step = agent.train_step_counter.numpy()

        # log progress or evaluate policy if needed (depending on hyperparameters)
        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, loss))
        if step % eval_interval == 0:
            avg = avg_return(tf_env, agent.policy, num_eval_episodes)
            print('step = {0}: Average return = {1}'.format(step, avg))
            returns.append(avg)
            if avg > -5:
                saver = policy_saver.PolicySaver(agent.policy, batch_size=None)
                saver.save('./models/policy' + str(i) + "$" + str(avg))

    # results
    # output runtime
    print("<><><>runtime: %s seconds<><><>" % (time.time() - start_time))

    # save the trained agent in the saved_model format for later use
    saver = policy_saver.PolicySaver(agent.policy, batch_size=None)
    saver.save('./models/policyF')
    tf.saved_model.save(agent, "./models/")

    # produce graph of the training process
    iterations = range(0, num_iterations + 1, eval_interval)
    plt.plot(iterations, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Iteration')
    plt.title('Average Return Over Time')
    plt.show()
fc_layer_params = (560, 60)
conv_layer_params = [(70, (8, 8), 4), (140, (4, 4), 2), (280, (3, 3), 1)]

q_net = q_network.QNetwork(nimble_quest_env.observation_spec(),
                           nimble_quest_env.action_spec(),
                           conv_layer_params=conv_layer_params,
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# global_step = tf.compat.v1.train.get_or_create_global_step()
global_step = tf.compat.v1.train.get_global_step()

#########################################################################
agent = dqn_agent.DdqnAgent(nimble_quest_env.time_step_spec(),
                            nimble_quest_env.action_spec(),
                            q_network=q_net,
                            optimizer=optimizer,
                            td_errors_loss_fn=common.element_wise_squared_loss,
                            train_step_counter=global_step)
agent.initialize()
##########################################################################

eval_policy = agent.policy
collect_policy = agent.collect_policy

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=nimble_quest_env.batch_size,
    max_length=replay_buffer_max_length)

checkpoint_dir = os.path.join(tempdir, 'checkpoint')
def __call__(self, trial):
    outs = []
    self.trial += 1

    # parameters (the fixed values below were filtered out by the first study)
    batch_size = 32
    custom_last_layer = True
    doubleQ = True
    encoder_type = 3
    epsilon_greedy = 0.1
    gamma = 0.99
    gradient_clipping = True
    normalize_env = False
    rate = 0.1
    replay_buffer_max_length = 100000
    target_update_tau = 1
    learning_rate = 1e-4
    num_heads = 4
    initial_collect_steps = 500

    custom_layer_init = trial.suggest_categorical('custom_layer_init', [0.5, 1])
    custom_lr_schedule = trial.suggest_categorical(
        'custom_lr_schedule', ["No", "Transformer"])  # ["Linear", "No", "Transformer", "Transformer_low"]
    layer_type = trial.suggest_categorical('layer_type', [3, 6])  # [1, 2, 3, 5, 6, 7]
    loss_function = trial.suggest_categorical(
        'loss_function', ["element_wise_huber_loss", "element_wise_squared_loss"])
    target_update_period = trial.suggest_categorical(
        'target_update_period', [5, 10])  # [5, 10, 15]

    for x in range(self.args.n_trys):
        global_step = tf.Variable(0, trainable=False, dtype="int64", name="global_step")

        baseEnv = gym.make(self.args.env)
        env = suite_gym.load(self.args.env)
        eval_env = suite_gym.load(self.args.env)
        if normalize_env == True:
            env = NormalizeWrapper(env, self.args.approx_env_boundaries, self.args.env)
            eval_env = NormalizeWrapper(eval_env, self.args.approx_env_boundaries, self.args.env)
        env = PyhistoryWrapper(env, self.args.max_horizon, self.args.atari)
        eval_env = PyhistoryWrapper(eval_env, self.args.max_horizon, self.args.atari)
        tf_env = tf_py_environment.TFPyEnvironment(env)
        eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)

        q_net = QTransformer(tf_env.observation_spec(),
                             baseEnv.action_space.n,
                             num_layers=self.args.num_layers,
                             d_model=self.args.d_model,
                             num_heads=num_heads,
                             dff=self.args.dff,
                             rate=rate,
                             encoderType=encoder_type,
                             enc_layer_type=layer_type,
                             max_horizon=self.args.max_horizon,
                             custom_layer=custom_layer_init,
                             custom_last_layer=custom_last_layer)

        if custom_lr_schedule == "Transformer":
            # builds a lr schedule according to the original usage for the transformer
            learning_rate = CustomSchedule(self.args.d_model, int(self.args.num_iterations / 10))
            optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        elif custom_lr_schedule == "Transformer_low":
            # same schedule with a lower general lr
            learning_rate = CustomSchedule(int(self.args.d_model / 2), int(self.args.num_iterations / 10))
            optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        elif custom_lr_schedule == "Linear":
            lrs = LinearCustomSchedule(learning_rate, self.args.num_iterations)
            optimizer = tf.keras.optimizers.Adam(lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        else:
            optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

        if loss_function == "element_wise_huber_loss":
            lf = element_wise_huber_loss
        elif loss_function == "element_wise_squared_loss":
            lf = element_wise_squared_loss

        if doubleQ == False:
            agent = dqn_agent.DqnAgent(
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                q_network=q_net,
                epsilon_greedy=epsilon_greedy,
                target_update_tau=target_update_tau,
                target_update_period=target_update_period,
                optimizer=optimizer,
                gamma=gamma,
                td_errors_loss_fn=lf,
                reward_scale_factor=self.args.reward_scale_factor,
                gradient_clipping=gradient_clipping,
                debug_summaries=self.args.debug_summaries,
                summarize_grads_and_vars=self.args.summarize_grads_and_vars,
                train_step_counter=global_step)
        else:
            agent = dqn_agent.DdqnAgent(
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                q_network=q_net,
                epsilon_greedy=epsilon_greedy,
                target_update_tau=target_update_tau,
                target_update_period=target_update_period,
                optimizer=optimizer,
                gamma=gamma,
                td_errors_loss_fn=lf,
                reward_scale_factor=self.args.reward_scale_factor,
                gradient_clipping=gradient_clipping,
                debug_summaries=self.args.debug_summaries,
                summarize_grads_and_vars=self.args.summarize_grads_and_vars,
                train_step_counter=global_step)
        agent.initialize()

        metric = train_eval_2(
            root_dir=os.path.join(self.args.output_dir, str(self.trial) + "_" + str(x)),
            num_eval_episodes=self.args.num_eval_episodes,
            tf_env=tf_env,
            eval_tf_env=eval_tf_env,
            agent=agent,
            eval_interval=self.args.eval_interval,
            summary_interval=self.args.summary_interval,
            num_iterations=self.args.num_iterations,
            initial_collect_steps=initial_collect_steps,
            collect_steps_per_iteration=self.args.collect_steps_per_iteration,
            replay_buffer_capacity=replay_buffer_max_length,
            train_steps_per_iteration=self.args.train_steps_per_iteration,
            batch_size=batch_size,
            use_tf_functions=self.args.run_graph_mode,
            log_interval=self.args.log_interval,
            global_step=global_step)
        outs.append(metric)

    # since we are minimizing, we need to take the negative reward sum
    return -np.mean(outs)
def __init__(self):
    self._train_py_env = suite_gym.load(
        T48GymEnv.GYM_ENV_NAME,
        max_episode_steps=T48GymTensorflowContext.max_episode_steps)
    self._eval_py_env = suite_gym.load(
        T48GymEnv.GYM_ENV_NAME,
        max_episode_steps=T48GymTensorflowContext.max_episode_steps)
    self._train_env = tf_py_environment.TFPyEnvironment(self._train_py_env)
    self._eval_env = tf_py_environment.TFPyEnvironment(self._eval_py_env)

    self._global_step = tf.compat.v1.train.get_or_create_global_step()
    self._q_net = q_network.QNetwork(
        self._train_env.observation_spec(),
        self._train_env.action_spec(),
        fc_layer_params=(100,))
    self._agent = dqn_agent.DdqnAgent(
        self._train_env.time_step_spec(),
        self._train_env.action_spec(),
        q_network=self._q_net,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=T48GymTensorflowContext.learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=self._global_step,
        epsilon_greedy=0.0)
    self._agent.initialize()
    self._agent.train = common.function(self._agent.train)

    self._replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=self._agent.collect_data_spec,
        batch_size=self._train_env.batch_size,
        max_length=T48GymTensorflowContext.replay_buffer_max_length)
    self._dataset = self._replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=self._train_env.batch_size,
        num_steps=2).prefetch(3)
    self._iterator = iter(self._dataset)

    self._RANDOM_POLICY = random_tf_policy.RandomTFPolicy(
        self._train_env.time_step_spec(), self._train_env.action_spec())
    self._collect_policy = self._agent.collect_policy
    self._eval_policy = self._agent.policy

    self._collect_driver = dynamic_step_driver.DynamicStepDriver(
        self._train_env,
        self._collect_policy,
        observers=[self._replay_buffer.add_batch] + T48GymTensorflowContext.train_metrics,
        num_steps=2)

    self._train_checkpointer = common.Checkpointer(
        ckpt_dir=T48GymTensorflowContext.train_dir,
        global_step=self._global_step,
        agent=self._agent,
        metrics=metric_utils.MetricsGroup(T48GymTensorflowContext.train_metrics, 'train_metrics'))
    self._policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(T48GymTensorflowContext.train_dir, 'policy'),
        global_step=self._global_step,
        policy=self._eval_policy)
    self._rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(T48GymTensorflowContext.train_dir, 'replay_buffer'),
        max_to_keep=1,
        replay_buffer=self._replay_buffer)
    self._tf_policy_saver = policy_saver.PolicySaver(self._agent.policy)

    self._train_checkpointer.initialize_or_restore()
    self._policy_checkpointer.initialize_or_restore()
    self._rb_checkpointer.initialize_or_restore()
def __call__(self, trial):
    outs = []
    self.trial += 1

    # parameters
    doubleQ = trial.suggest_categorical('doubleQ', [True, False])
    custom_layer_init = trial.suggest_categorical('custom_layer_init', [None])
    custom_last_layer = trial.suggest_categorical('custom_last_layer', [True, False])
    initial_collect_steps = trial.suggest_categorical('initial_collect_steps', [100])
    loss_function = trial.suggest_categorical(
        'loss_function', ["element_wise_huber_loss", "element_wise_squared_loss"])
    num_heads = trial.suggest_categorical('num_heads', [2, 4, 8])
    normalize_env = trial.suggest_categorical('normalize_env', [False])  # broken
    custom_lr_schedule = trial.suggest_categorical('custom_lr_schedule', ["No"])
    epsilon_greedy = trial.suggest_categorical('epsilon_greedy', [0.1, 0.2, 0.3])
    target_update_period = trial.suggest_categorical('target_update_period', [5, 10, 15])
    rate = trial.suggest_categorical('rate', [0.1, 0.3])
    gradient_clipping = trial.suggest_categorical('gradient_clipping', [True, False])
    replay_buffer_max_length = trial.suggest_categorical('replay_buffer_max_length', [100000, 200000])
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    learning_rate = trial.suggest_categorical('learning_rate', [1e-2, 1e-3, 1e-4, 1e-5])
    encoder_type = trial.suggest_categorical('encoder_type', [2, 3])
    layer_type = trial.suggest_categorical('layer_type', [1, 2, 3, 5, 6, 7])
    target_update_tau = trial.suggest_categorical('target_update_tau', [0.05, 0.1])
    gamma = trial.suggest_categorical('gamma', [0.99, 0.95])

    for x in range(self.args.n_trys):
        global_step = tf.Variable(0, trainable=False, dtype="int64", name="global_step")

        baseEnv = gym.make(self.args.env)
        env = suite_gym.load(self.args.env)
        eval_env = suite_gym.load(self.args.env)
        if normalize_env == True:
            env = NormalizeWrapper(env, self.args.approx_env_boundaries, self.args.env)
            eval_env = NormalizeWrapper(eval_env, self.args.approx_env_boundaries, self.args.env)
        env = PyhistoryWrapper(env, self.args.max_horizon, self.args.atari)
        eval_env = PyhistoryWrapper(eval_env, self.args.max_horizon, self.args.atari)
        tf_env = tf_py_environment.TFPyEnvironment(env)
        eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)

        q_net = QTransformer(tf_env.observation_spec(),
                             baseEnv.action_space.n,
                             num_layers=self.args.num_layers,
                             d_model=self.args.d_model,
                             num_heads=num_heads,
                             dff=self.args.dff,
                             rate=rate,
                             encoderType=encoder_type,
                             enc_layer_type=layer_type,
                             max_horizon=self.args.max_horizon,
                             custom_layer=custom_layer_init,
                             custom_last_layer=custom_last_layer)

        if custom_lr_schedule == "Transformer":
            # builds a lr schedule according to the original usage for the transformer
            learning_rate = CustomSchedule(self.args.d_model, int(self.args.num_iterations / 10))
            optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        elif custom_lr_schedule == "Transformer_low":
            # same schedule with a lower general lr
            learning_rate = CustomSchedule(int(self.args.d_model / 2), int(self.args.num_iterations / 10))
            optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        elif custom_lr_schedule == "Linear":
            lrs = LinearCustomSchedule(learning_rate, self.args.num_iterations)
            optimizer = tf.keras.optimizers.Adam(lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        else:
            optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

        if loss_function == "element_wise_huber_loss":
            lf = element_wise_huber_loss
        elif loss_function == "element_wise_squared_loss":
            lf = element_wise_squared_loss

        if doubleQ == False:
            agent = dqn_agent.DqnAgent(
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                q_network=q_net,
                epsilon_greedy=epsilon_greedy,
                target_update_tau=target_update_tau,
                target_update_period=target_update_period,
                optimizer=optimizer,
                gamma=gamma,
                td_errors_loss_fn=lf,
                reward_scale_factor=self.args.reward_scale_factor,
                gradient_clipping=gradient_clipping,
                debug_summaries=self.args.debug_summaries,
                summarize_grads_and_vars=self.args.summarize_grads_and_vars,
                train_step_counter=global_step)
        else:
            agent = dqn_agent.DdqnAgent(
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                q_network=q_net,
                epsilon_greedy=epsilon_greedy,
                target_update_tau=target_update_tau,
                target_update_period=target_update_period,
                optimizer=optimizer,
                gamma=gamma,
                td_errors_loss_fn=lf,
                reward_scale_factor=self.args.reward_scale_factor,
                gradient_clipping=gradient_clipping,
                debug_summaries=self.args.debug_summaries,
                summarize_grads_and_vars=self.args.summarize_grads_and_vars,
                train_step_counter=global_step)
        agent.initialize()

        tf.profiler.experimental.start('profiler')
        metric = train_eval_2(
            root_dir=os.path.join(self.args.output_dir, str(self.trial) + "_" + str(x)),
            num_eval_episodes=self.args.num_eval_episodes,
            tf_env=tf_env,
            eval_tf_env=eval_tf_env,
            agent=agent,
            eval_interval=self.args.eval_interval,
            summary_interval=self.args.summary_interval,
            num_iterations=self.args.num_iterations,
            initial_collect_steps=initial_collect_steps,
            collect_steps_per_iteration=self.args.collect_steps_per_iteration,
            replay_buffer_capacity=replay_buffer_max_length,
            train_steps_per_iteration=self.args.train_steps_per_iteration,
            batch_size=batch_size,
            use_tf_functions=self.args.run_graph_mode,
            log_interval=self.args.log_interval,
            global_step=global_step)
        tf.profiler.experimental.stop()

    return 0
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.compat.v2.Variable(0)

tf_agent = dqn_agent.DdqnAgent(train_env.time_step_spec(),
                               train_env.action_spec(),
                               q_network=q_net,
                               optimizer=optimizer,
                               epsilon_greedy=0.01,
                               td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
                               train_step_counter=train_step_counter)
print("ready to go")

random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())
print("policy")


def collect_step(environment, policy, random_policy, agent, max_t):
    """Collect one step of experience in the environment with the given policy."""
    debug_mode = False
    environment.set_debug(False)
def main(arg, pars):
    """Train a DDQN agent on the Car-v0 environment."""
    print("load env ..")
    env_name = ("Car-v0")
    # env = gym.make("Car-v0")
    env = suite_gym.load(env_name, discount=arg.gamma, max_episode_steps=arg.max_t)
    print_parameter(arg, pars)
    train_py_env = suite_gym.load(env_name, discount=arg.gamma, max_episode_steps=arg.max_t)
    eval_py_env = suite_gym.load(env_name, discount=arg.gamma, max_episode_steps=arg.max_t)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    print("env loaded")

    train_dir = os.path.join(arg.root_dir, 'network_weights')
    eval_dir = os.path.join(arg.root_dir, 'eval')
    train_env.reset()

    fc_layer_params = (arg.hidden_size_1,)
    q_net = q_network.QNetwork(train_env.observation_spec(),
                               train_env.action_spec(),
                               fc_layer_params=fc_layer_params)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=arg.lr)
    train_step_counter = tf.compat.v2.Variable(0)
    tf_agent = dqn_agent.DdqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=arg.eps_start,
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        train_step_counter=train_step_counter)

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]
    global_step = tf.compat.v1.train.get_or_create_global_step()
    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))

    if arg.continue_training == False:
        tf_agent.initialize()
        if os.path.exists("network_weights/*"):
            os.remove("network_weights/*")
    else:
        print("Continue Training")
        train_checkpointer.initialize_or_restore()
    print("ready to go")

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=arg.buffer_size)
    tf_agent.collect_data_spec
    tf_agent.collect_data_spec._fields

    cv2.namedWindow("display", cv2.WINDOW_NORMAL)
    collect_data(train_env, random_policy, replay_buffer, tf_agent,
                 steps=arg.learn_start, max_t=40)
    print("create dataset")
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=arg.batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # (Optional) Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)
    # Reset the train step
    tf_agent.train_step_counter.assign(0)

    avg_return = compute_avg_return(eval_env, tf_agent.policy, arg.num_eval_episodes)
    returns = [avg_return]
    returns_average = [avg_return]
    train_loss_average = [1]
    score = 0
    scores_window = deque(maxlen=100)  # last 100 scores
    total_train_loss = deque(maxlen=100)  # last 100 losses
    train(arg, tf_agent, train_env, eval_env, replay_buffer, iterator, train_checkpointer)
def train_level(level, consecutive_wins_flag=5, collect_random_steps=True,
                max_iterations=num_iterations):
    """Create a DQN agent to train a level of the game.

    :param level: level of the game
    :param consecutive_wins_flag: number of consecutive wins in evaluation
        signifying the training is done
    :param collect_random_steps: whether to collect random steps at the beginning;
        always set to 'True' when the global step is 0
    :param max_iterations: stop the training when it reaches the max iteration
        regardless of the result
    """
    global saving_time

    cells = query_level(level)
    size = len(cells)
    env = tf_py_environment.TFPyEnvironment(GameEnv(size, cells))
    eval_env = tf_py_environment.TFPyEnvironment(GameEnv(size, cells))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    fc_layer_params = (neuron_num_mapper[size],)
    q_net = q_network.QNetwork(env.observation_spec()[0],
                               env.action_spec(),
                               fc_layer_params=fc_layer_params,
                               activation_fn=tf.keras.activations.relu)
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = dqn_agent.DdqnAgent(
        env.time_step_spec(),
        env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=global_step,
        observation_and_action_constraint_splitter=GameEnv.obs_and_mask_splitter)
    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=replay_buffer_max_length)

    # drivers
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        env,
        policy=agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]
    eval_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        eval_env,
        policy=agent.policy,
        observers=eval_metrics,
        num_episodes=num_eval_episodes)

    # checkpointer of the replay buffer and policy
    train_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(dir_path, 'trained_policies/train_lv{0}'.format(level)),
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        global_step=global_step,
        replay_buffer=replay_buffer)
    # policy saver
    tf_policy_saver = policy_saver.PolicySaver(agent.policy)
    train_checkpointer.initialize_or_restore()

    # optimize by wrapping some of the code in a graph using TF function
    agent.train = common.function(agent.train)
    collect_driver.run = common.function(collect_driver.run)
    eval_driver.run = common.function(eval_driver.run)

    # collect initial replay data
    if collect_random_steps:
        initial_collect_policy = random_tf_policy.RandomTFPolicy(
            time_step_spec=env.time_step_spec(),
            action_spec=env.action_spec(),
            observation_and_action_constraint_splitter=GameEnv.obs_and_mask_splitter)
        dynamic_step_driver.DynamicStepDriver(
            env,
            initial_collect_policy,
            observers=[replay_buffer.add_batch],
            num_steps=initial_collect_steps).run()

    # Dataset generates trajectories with shape [Bx2x...]
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # train until the required number of consecutive evaluations exceed the
    # reward threshold, or until max_iterations is reached
    consecutive_eval_win = 0
    train_iterations = 0
    while consecutive_eval_win < consecutive_wins_flag and train_iterations < max_iterations:
        collect_driver.run()
        for _ in range(collect_steps_per_iteration):
            experience, _ = next(iterator)
            train_loss = agent.train(experience).loss

        # evaluate the training at intervals
        step = global_step.numpy()
        if step % eval_interval == 0:
            eval_driver.run()
            average_return = eval_metrics[0].result().numpy()
            average_len = eval_metrics[1].result().numpy()
            print("level: {0} step: {1} AverageReturn: {2} AverageLen: {3}".format(
                level, step, average_return, average_len))
            # evaluate consecutive wins
            if average_return > 10:
                consecutive_eval_win += 1
            else:
                consecutive_eval_win = 0
        if step % save_interval == 0:
            start = time.time()
            train_checkpointer.save(global_step=step)
            saving_time += time.time() - start
        train_iterations += 1

    # save the policy
    train_checkpointer.save(global_step=global_step.numpy())
    tf_policy_saver.save(
        os.path.join(dir_path, 'trained_policies/policy_lv{0}'.format(level)))
def create_agent(
        agent_class,
        environment,
        fc_layer_params,
        learning_rate,
        decaying_epsilon,
        n_step_update,
        target_update_tau,
        target_update_period,
        gamma,
        reward_scale_factor,
        gradient_clipping,
        debug_summaries,
        summarize_grads_and_vars,
        train_step_counter,
        num_atoms=None,  # Only for categorical_dqn
        min_q_value=None,  # Only for categorical_dqn
        max_q_value=None,  # Only for categorical_dqn
):
    """Creates the Hanabi agent.

    Args:
      agent_class: str, type of agent to construct.
      environment: The environment.
      fc_layer_params: Fully connected layer sizes for the Q-network.
      learning_rate: The learning rate.
      decaying_epsilon: Epsilon for the epsilon-greedy policy.
      n_step_update: Agent parameter.
      target_update_tau: Agent parameter.
      target_update_period: Agent parameter.
      gamma: Agent parameter.
      reward_scale_factor: Agent parameter.
      gradient_clipping: Agent parameter.
      debug_summaries: Agent parameter.
      summarize_grads_and_vars: Agent parameter.
      train_step_counter: The train step tf.Variable to be passed to the agent.

    Returns:
      An agent for playing Hanabi.

    Raises:
      ValueError: if an unknown agent type is requested.
    """
    if agent_class == 'DQN':
        return dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'DDQN':
        return dqn_agent.DdqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'categorical_dqn':
        return categorical_dqn_agent.CategoricalDqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            categorical_q_network=categorical_q_network.CategoricalQNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                num_atoms=num_atoms,
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            min_q_value=min_q_value,
            max_q_value=max_q_value,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    else:
        raise ValueError('Expected valid agent_class, got {}'.format(agent_class))
def main():
    parser = argparse.ArgumentParser()

    ## Essential parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model stats and checkpoints will be written.")
    parser.add_argument("--env", default=None, type=str, required=True,
                        help="The environment to train the agent on")
    parser.add_argument("--approx_env_boundaries", default=False, type=bool,
                        help="Whether to get the env boundaries approximately")
    parser.add_argument("--max_horizon", default=4, type=int)
    parser.add_argument("--atari", default=True, type=bool,
                        help="Gets some data types correctly")

    ## Agent parameters
    parser.add_argument("--reward_scale_factor", default=1.0, type=float)
    parser.add_argument("--debug_summaries", default=False, type=bool)
    parser.add_argument("--summarize_grads_and_vars", default=False, type=bool)

    ## Transformer parameters
    parser.add_argument("--d_model", default=64, type=int)
    parser.add_argument("--num_layers", default=2, type=int)
    parser.add_argument("--dff", default=256, type=int)

    ## Training parameters
    parser.add_argument('--num_iterations', type=int, default=2000000,
                        help="steps in the env")
    parser.add_argument('--num_iparallel', type=int, default=1,
                        help="how many envs should run in parallel")
    parser.add_argument("--collect_steps_per_iteration", default=4, type=int)
    parser.add_argument("--train_steps_per_iteration", default=1, type=int)

    ## Other parameters
    parser.add_argument("--num_eval_episodes", default=10, type=int)
    parser.add_argument("--eval_interval", default=10000, type=int)
    parser.add_argument("--log_interval", default=10000, type=int)
    parser.add_argument("--summary_interval", default=10000, type=int)
    parser.add_argument("--run_graph_mode", default=True, type=bool)
    parser.add_argument("--checkpoint_interval", default=100000, type=int)
    parser.add_argument("--summary_flush", default=10, type=int)  # what does this exactly do?

    ## HP opt params
    parser.add_argument("--doubleQ", default=True, type=bool,
                        help="Whether to use a DoubleQ agent")
    parser.add_argument("--custom_last_layer", default=True, type=bool)
    parser.add_argument("--custom_layer_init", default=1.0, type=float)
    parser.add_argument("--initial_collect_steps", default=50000, type=int)
    parser.add_argument("--loss_function", default="element_wise_huber_loss", type=str)
    parser.add_argument("--num_heads", default=4, type=int)
    parser.add_argument("--normalize_env", default=False, type=bool)
    parser.add_argument('--custom_lr_schedule', default="No", type=str,
                        help="whether to use a custom LR schedule")
    parser.add_argument("--epsilon_greedy", default=0.3, type=float)
    parser.add_argument("--target_update_period", default=10000, type=int)
    # dropout rate (might not be used depending on the q network). Setting this
    # to 0.0 somehow breaks the code; not relevant though, just select a network
    # without dropout.
    parser.add_argument("--rate", default=0.1, type=float)
    parser.add_argument("--gradient_clipping", default=True, type=bool)
    parser.add_argument("--replay_buffer_max_length", default=1000000, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument("--encoder_type", default=3, type=int,
                        help="Which type of encoder is used for the model")
    parser.add_argument("--layer_type", default=3, type=int,
                        help="Which type of layer is used for the encoder")
    parser.add_argument("--target_update_tau", default=1, type=float)
    parser.add_argument("--gamma", default=0.99, type=float)

    args = parser.parse_args()

    # List of encoder modules which we can use to change encoder based on a variable
    global_step = tf.compat.v1.train.get_or_create_global_step()

    baseEnv = gym.make(args.env)
    env = suite_gym.load(args.env, gym_kwargs={"frameskip": 4})
    eval_env = suite_gym.load(args.env, gym_kwargs={"frameskip": 4})
    if args.normalize_env == True:
        env = NormalizeWrapper(env, args.approx_env_boundaries, args.env)
        eval_env = NormalizeWrapper(eval_env, args.approx_env_boundaries, args.env)
    env = PyhistoryWrapper(env, args.max_horizon, args.atari)
    eval_env = PyhistoryWrapper(eval_env, args.max_horizon, args.atari)
    tf_env = tf_py_environment.TFPyEnvironment(env)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)

    q_net = QTransformer(tf_env.observation_spec(),
                         baseEnv.action_space.n,
                         num_layers=args.num_layers,
                         d_model=args.d_model,
                         num_heads=args.num_heads,
                         dff=args.dff,
                         rate=args.rate,
                         encoderType=args.encoder_type,
                         enc_layer_type=args.layer_type,
                         max_horizon=args.max_horizon,
                         custom_layer=args.custom_layer_init,
                         custom_last_layer=args.custom_last_layer)

    if args.custom_lr_schedule == "Transformer":
        # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(args.d_model, int(args.num_iterations / 10))
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    elif args.custom_lr_schedule == "Transformer_low":
        # same schedule with a lower general lr
        learning_rate = CustomSchedule(int(args.d_model / 2), int(args.num_iterations / 10))
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    elif args.custom_lr_schedule == "Linear":
        lrs = LinearCustomSchedule(learning_rate, args.num_iterations)
        optimizer = tf.keras.optimizers.Adam(lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    else:
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)

    if args.loss_function == "element_wise_huber_loss":
        lf = element_wise_huber_loss
    elif args.loss_function == "element_wise_squared_loss":
        lf = element_wise_squared_loss

    if args.doubleQ == False:
        agent = dqn_agent.DqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=args.epsilon_greedy,
            target_update_tau=args.target_update_tau,
            target_update_period=args.target_update_period,
            td_errors_loss_fn=lf,
            optimizer=optimizer,
            gamma=args.gamma,
            reward_scale_factor=args.reward_scale_factor,
            gradient_clipping=args.gradient_clipping,
            debug_summaries=args.debug_summaries,
            summarize_grads_and_vars=args.summarize_grads_and_vars,
            train_step_counter=global_step)
    else:
        agent = dqn_agent.DdqnAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            q_network=q_net,
            epsilon_greedy=args.epsilon_greedy,
            target_update_tau=args.target_update_tau,
            target_update_period=args.target_update_period,
            td_errors_loss_fn=lf,
            optimizer=optimizer,
            gamma=args.gamma,
            reward_scale_factor=args.reward_scale_factor,
            gradient_clipping=args.gradient_clipping,
            debug_summaries=args.debug_summaries,
            summarize_grads_and_vars=args.summarize_grads_and_vars,
            train_step_counter=global_step)
    agent.initialize()

    count_weights(q_net)

    train_eval(root_dir=args.output_dir,
               tf_env=tf_env,
               eval_tf_env=eval_tf_env,
               agent=agent,
               num_iterations=args.num_iterations,
               initial_collect_steps=args.initial_collect_steps,
               collect_steps_per_iteration=args.collect_steps_per_iteration,
               replay_buffer_capacity=args.replay_buffer_max_length,
               train_steps_per_iteration=args.train_steps_per_iteration,
               batch_size=args.batch_size,
               use_tf_functions=args.run_graph_mode,
               num_eval_episodes=args.num_eval_episodes,
               eval_interval=args.eval_interval,
               train_checkpoint_interval=args.checkpoint_interval,
               policy_checkpoint_interval=args.checkpoint_interval,
               rb_checkpoint_interval=args.checkpoint_interval,
               log_interval=args.log_interval,
               summary_interval=args.summary_interval,
               summaries_flush_secs=args.summary_flush)

    pickle.dump(args, open(args.output_dir + "/training_args.p", "wb"))
    print("Successfully trained and evaluated.")
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy", inttype=retro.data.Integrations.ALL)
    print(env)
    # tf_env = tf_py_environment.TFPyEnvironment(env)

    # printCounter = 0
    # if printCounter % 10000:
    #     print("reward: ", rew)
    # printCounter += 1

    obs = env.reset()

    # get start time
    start_time = time.time()

    network = q_network.QNetwork(env.observation_spec(),
                                 env.action_spec(),
                                 fc_layer_params=(100,))

    # initialize the agent with the listed parameters
    agent = dqn_agent.DdqnAgent(env.time_step_spec(),
                                env.action_spec(),
                                q_network=network,
                                optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                                td_errors_loss_fn=common.element_wise_squared_loss,
                                train_step_counter=tf.Variable(0),
                                epsilon_greedy=epsilon,
                                gamma=gamma,
                                name=name)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=replay_buffer_capacity)

    # add an observer to add to the buffer
    replay_observer = [replay_buffer.add_batch]

    # create step driver
    # collect_op = dynamic_step_driver.DynamicStepDriver(env, agent.collect_policy, observers=replay_observer, num_steps=10).run()

    # create random policy to help generate the dataset
    random_policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(), env.action_spec())

    # populate replay buffer
    collect_data(env, random_policy, replay_buffer, initial_collect_steps)

    # generate trajectories; num_steps=2 so that it views current and next observation in the dataset
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)

    # create iterator for the dataset to feed the agent
    iterator = iter(dataset)
    # print(iterator)

    # wrap in a graph for TF optimization
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)  # reset

    # evaluate the initialized policy prior to training for a baseline
    avg = avg_return(env, agent.policy, num_eval_episodes)
    returns = [avg]  # holds average returns from multiple points during training

    # main training loop
    for i in range(num_iterations):
        env.render()
        collect_data(env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

        # sample and update network
        exp, _ = next(iterator)
        loss = agent.train(exp).loss

        # get step
        step = agent.train_step_counter.numpy()

        # log progress or evaluate policy if needed (depending on hyperparameters)
        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, loss))
        if step % eval_interval == 0:
            avg = avg_return(env, agent.policy, num_eval_episodes)
            print('step = {0}: Average return = {1}'.format(step, avg))
            returns.append(avg)

    # produce graph of the training process
    iterations = range(0, num_iterations + 1, eval_interval)
    plt.plot(iterations, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Iteration')
    plt.title('Average Return Over Time')
    plt.show()
optimizer = tf.keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                        rho=0.95,
                                        momentum=0.0,
                                        epsilon=0.00001,
                                        centered=True)
epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=250000 // update_period,
    end_learning_rate=0.01)
agent = dqn_agent.DdqnAgent(
    train_tf_env.time_step_spec(),
    train_tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,
    td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"),
    gamma=0.99,
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_tf_env.batch_size,
    max_length=1000000)
replay_buffer_observer = replay_buffer.add_batch

train_metrics = [
q_net = q_network.QNetwork(train_game_env.observation_spec(),
                           train_game_env.action_spec(),
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate,
                                             epsilon=adam_epsilon)
train_step_counter = tf.Variable(0)
epsilon = tf.compat.v1.train.polynomial_decay(start_epsilon,
                                              train_step_counter,
                                              num_iterations,
                                              end_learning_rate=end_epsilon)
agent = dqn_agent.DdqnAgent(train_game_env.time_step_spec(),
                            train_game_env.action_spec(),
                            q_network=q_net,
                            epsilon_greedy=epsilon,
                            optimizer=optimizer,
                            target_update_period=target_update_period,
                            td_errors_loss_fn=common.element_wise_squared_loss,
                            train_step_counter=train_step_counter)
agent.initialize()


# DEFINE METRICS ETC. (see tf-agents DQN tutorial)
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for i in range(num_episodes):
        time_step = environment.reset()