def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
            Instance of LinearExploration
        lr_schedule: Schedule for learning rate
            Instance of LinearSchedule
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.board_size)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_p_values = deque(maxlen=1000)
    p_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    episode = last_checkpoint = 0
    opponents = []  # opponents used to play against in training

    # load model from checkpoint if necessary
    if self.config.checkpoint != -1:
        self.saver.restore(self.sess, self.config.load_checkpoint_file)
        opponents += [self.generate_opponent(t)]
        t = self.config.checkpoint

    scores_eval = [self.evaluate(t)[0]]  # list of scores computed at iteration time

    prog = Progbar(target=self.config.nsteps_train)

    # files for writing stats
    train_game_length_f = open("train_game_lengths.txt", 'w', buffering=1)
    eval_game_length_f = open("eval_game_lengths.txt", 'w', buffering=1)

    # per-episode training loop
    while t < self.config.nsteps_train:
        # variables for this episode
        state = self.env.reset()
        states = []
        actions = []
        # whether the agent being trained plays as black (i.e. moves first)
        training_agent_is_black = random.choice([True, False])
        episode += 1
        last_checkpoint += 1

        # add an opponent to the pool if it's time to
        if t == 0 or last_checkpoint > self.config.checkpoint_freq:
            opponents += [self.generate_opponent(t)]
            last_checkpoint = 0

        # randomly sample an opponent for this episode
        self.opponent_out = random.sample(opponents, 1)[0]

        # If our agent plays as white, let the opponent make the first move.
        # The game cannot end after a single move, so no terminal check is needed here.
        if not training_agent_is_black:
            player = self.env.state.color
            player_perspective_board = self._board_from_player_perspective(state, player)
            best_action, _, _ = self.get_best_valid_action(player_perspective_board)
            state, _, _, _ = self.env.step(best_action)

        # per-action training loop
        while True:
            # increment counters
            t += 1
            last_eval += 1
            last_record += 1

            # render?
            if self.config.render_train:
                print("Board before agent moves:")
                self.env.render()

            # whose turn is it?
            player = self.env.state.color

            # choose action according to current state and exploration
            player_perspective_board = self._board_from_player_perspective(state, player)
            action, action_dist, valid_actions = self.sample_valid_action(player_perspective_board)
            action = exp_schedule.get_action(action, valid_actions)

            # store p values
            max_p_values.append(max(action_dist))
            p_values += list(action_dist)

            # perform action in env, and remember if the agent just made the losing move
            new_state, _, done, _ = self.env.step(action)
            training_agent_made_last_move = done

            # render?
            if self.config.render_train:
                print("Board after agent moves:")
                self.env.render()

            # store the s, a for later use in the replay buffer
            states.append(self._board_from_player_perspective(state, player))
            actions.append(action)

            # if the game hasn't ended, let the opponent move
            if not done:
                player = self.env.state.color
                player_perspective_board = self._board_from_player_perspective(new_state, player)
                best_action, _, _ = self.get_opponent_best_valid_action(player_perspective_board)
                new_state, _, done, _ = self.env.step(best_action)

            # Manually compute the reward; it's non-zero only when the game has
            # finished. The OpenAI env's reward is unreliable because of invalid
            # moves (the losing player 'resigns'), so we take the sign of the
            # 'official score', which is positive iff white is winning, and flip
            # it according to whoever 'we' are playing as.
            reward = 0.0
            if done:
                reward = np.sign(self.env.state.board.official_score)
                if training_agent_is_black:
                    reward *= -1.0

            # store the transition
            state = new_state

            # now that we know the true reward (after the opponent's move) we can record it
            rewards.append(reward)

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # update schedules
            exp_schedule.update(t)
            lr_schedule.update(t)

            # logging
            if (t > self.config.learning_start) and (t % self.config.log_freq == 0):
                self.update_averages(rewards, max_p_values, p_values, scores_eval)
                if len(rewards) > 0:
                    prog.update(t + 1,
                                exact=[("Loss", loss_eval),
                                       ("Avg R", self.avg_reward),
                                       ("Max R", np.max(rewards)),
                                       ("eps", exp_schedule.epsilon),
                                       ("Grads", grad_eval),
                                       ("Max P", self.max_p),
                                       ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # if the game finished, log it and add it to the replay buffer
            if done:
                # logging (for some graphs)
                game_length = len(states)
                train_game_length_f.write(str(game_length) + '\n')

                # compute the values (discounted sum of rewards) for this game
                backpropagated_rewards = np.array([reward] * len(states))
                discounts = np.array(list(reversed(
                    [self.config.gamma ** i for i in range(len(states))])))
                discounted_values = backpropagated_rewards * discounts

                # If the training agent lost the game, make sure that its
                # LOSING move has a negative value...
                if training_agent_made_last_move and discounted_values[-1] > 0:
                    discounted_values[-1] *= -1.0

                # put everything in the replay buffer
                replay_buffer.store_example_batch(states, actions, discounted_values)

                # break from the per-action training loop
                break

        # if it's time to evaluate, then evaluate
        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            last_eval = 0
            print("")
            eval_avg_reward, eval_avg_length = self.evaluate(t)
            scores_eval += [eval_avg_reward]
            eval_game_length_f.write(str(eval_avg_length) + '\n')

    # last words
    self.logger.info("- Training done.")
    self.save()
    eval_avg_reward, eval_avg_length = self.evaluate(t)
    scores_eval += [eval_avg_reward]
    eval_game_length_f.write(str(eval_avg_length) + '\n')
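# The loop above assigns every move in a finished game the final reward, discounted
# back from the last move. A minimal standalone sketch of that computation (this
# mirrors the snippet's logic with gamma and episode length as the only inputs; it
# is illustrative, not extracted verbatim):
import numpy as np

def discounted_game_values(final_reward, game_length, gamma):
    # discounts run from gamma^(L-1) for the first move down to gamma^0 for the last
    discounts = np.array(list(reversed([gamma ** i for i in range(game_length)])))
    return final_reward * discounts

# e.g. discounted_game_values(1.0, 4, 0.9) -> [0.729, 0.81, 0.9, 1.0]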
def train(sess, env, Qnet, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load a saved model if one exists
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())
    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    Qnet.update_target_network()
    count_parameters()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()
    eval_acc_reward = 0
    tic = time.time()
    eps = 1

    while True:
        i += 1
        eps = EPS_DECAY_RATE ** i
        eps = max(eps, EPS_MIN)
        s = env.reset()
        # s = prepro(s)
        ep_ave_max_q = 0

        if i % SAVE_STEP == 0:  # save a checkpoint every SAVE_STEP episodes
            sess.run(global_step.assign(i))
            save_path = saver.save(sess, SUMMARY_DIR + "model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in xrange(MAX_EP_STEPS):
            predicted_q_value = Qnet.predict(np.reshape(s, np.hstack((1, Qnet.s_dim))))
            predicted_q_value = predicted_q_value[0]
            np.random.seed()

            # epsilon-greedy action selection
            action = np.argmax(predicted_q_value)
            if np.random.rand() < eps:
                action = np.random.randint(4)

            s2, r, terminal, info = env.step(action)
            # s2 = prepro(s2)

            action_vector = action_ecoder(action, Qnet.a_dim)
            replay_buffer.add(np.reshape(s, (Qnet.s_dim)),
                              np.reshape(action_vector, (Qnet.a_dim)), r,
                              terminal, np.reshape(s2, (Qnet.s_dim)))
            s = s2
            eval_acc_reward += r

            if terminal:
                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MINIBATCH_SIZE:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Calculate targets
                    target_q = Qnet.predict_target(s2_batch)
                    y_i = []
                    for k in xrange(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * np.max(target_q[k]))

                    # Update the Qnet given the targets
                    predicted_q_value, _ = Qnet.train(s_batch, a_batch, y_i)
                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the target network (every iteration; the original
                    # 1000-iteration gate is disabled)
                    # if i % TARGET_UPDATE_STEP == 0:
                    Qnet.update_target_network()

                if i % EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: (eval_acc_reward + EVAL_EPISODES) / 2,
                        summary_vars[1]: ep_ave_max_q / float(j + 1),
                    })
                    writer.add_summary(summary_str, i)
                    writer.flush()
                    print('| Success: %i %%' % ((eval_acc_reward + EVAL_EPISODES) / 2),
                          "| Episode", i,
                          '| Qmax: %.4f' % (ep_ave_max_q / float(j + 1)),
                          ' | Time: %.2f' % time_gap,
                          ' | Eps: %.2f' % eps)
                    tic = time.time()
                    eval_acc_reward = 0
                break
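# The snippet above stores actions through an `action_ecoder` helper that is not
# defined in this excerpt. A plausible one-hot implementation, assuming the replay
# buffer expects an a_dim-length action vector (an assumption, not the original
# definition):
import numpy as np

def action_ecoder(action, a_dim):
    # one-hot encode a discrete action index into a length-a_dim vector
    vec = np.zeros(a_dim)
    vec[action] = 1.0
    return vec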
def train(sess, env, env_test, args, agent):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    episode_R = []

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    agent.update_actor_target_network()
    agent.update_critic_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    total_step_cnt = 0
    test_iter = 0
    epi_cnt = 0
    return_test = np.zeros(
        (np.ceil(int(args['total_step_num']) / int(args['sample_step_num'])).astype('int') + 1))
    result_name = 'TD3_' + args['env'] + '_trial_idx_' + str(int(args['trial_idx']))
    action_noise = float(args['action_noise'])
    trained_times_steps = 0
    save_cnt = 1
    policy_ite = 0

    # for i in range(int(args['max_episodes'])):
    while total_step_cnt < int(args['total_step_num']):
        state = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        T_end = False

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            # Random exploration for the first 1e4 steps, then the policy's
            # action plus clipped Gaussian exploration noise
            if total_step_cnt < 1e4:
                action = env.action_space.sample()
            else:
                action = agent.predict_actor(
                    np.reshape(state, (1, agent.state_dim)))  # + actor_noise()
                clipped_noise = np.clip(
                    np.random.normal(0, action_noise, size=env.action_space.shape[0]),
                    -0.5, 0.5)
                action = (action + clipped_noise).clip(env.action_space.low,
                                                       env.action_space.high)

            state2, reward, terminal, info = env.step(action[0])

            replay_buffer.add(np.reshape(state, (agent.state_dim,)),
                              np.reshape(action, (agent.action_dim,)),
                              reward, terminal,
                              np.reshape(state2, (agent.state_dim,)))

            if j == int(args['max_episode_len']) - 1:
                T_end = True

            state = state2
            ep_reward += reward
            total_step_cnt += 1

            # Periodically evaluate the deterministic policy
            if total_step_cnt >= test_iter * int(args['sample_step_num']) or total_step_cnt == 1:
                print('total_step_cnt', total_step_cnt)
                print('evaluating the deterministic policy...')
                for nn in range(int(args['test_num'])):
                    state_test = env_test.reset()
                    return_epi_test = 0
                    for t_test in range(int(args['max_episode_len'])):
                        action_test = agent.predict_actor(
                            np.reshape(state_test, (1, agent.state_dim)))
                        state_test2, reward_test, terminal_test, info_test = env_test.step(
                            action_test[0])
                        state_test = state_test2
                        return_epi_test = return_epi_test + reward_test
                        if terminal_test:
                            break
                    print('test_iter:{:d}, nn:{:d}, return_epi_test: {:d}'.format(
                        int(test_iter), int(nn), int(return_epi_test)))
                    return_test[test_iter] = return_test[test_iter] + \
                        return_epi_test / float(args['test_num'])
                print('return_test[{:d}] {:d}'.format(
                    int(test_iter), int(return_test[test_iter])))
                test_iter += 1

            # Periodically save the policy models
            if total_step_cnt > int(args['save_model_num']) * save_cnt:
                model_path = "./Model/"
                try:
                    import pathlib
                    pathlib.Path(model_path).mkdir(parents=True, exist_ok=True)
                except:
                    print("A model directory does not exist and cannot be created. "
                          "The policy models are not saved")
                agent.save_model(iteration=test_iter, expname=result_name,
                                 model_path=model_path)
                save_cnt += 1

            if terminal or T_end:
                epi_cnt += 1
                print('| Reward: {:d} | Episode: {:d} | Total step num: {:d} |'.format(
                    int(ep_reward), epi_cnt, total_step_cnt))
                # episode_R.append(ep_reward)
                break

        # Train once per episode on the steps collected since the last update
        if total_step_cnt != int(args['total_step_num']) and total_step_cnt > 1e3:
            update_num = total_step_cnt - trained_times_steps
            trained_times_steps = total_step_cnt
            print('update_num', update_num)
            update_policy(sess, env, env_test, args, agent, replay_buffer,
                          action_noise, update_num)
            policy_ite += 1

    return return_test
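# The exploration step above adds clipped Gaussian noise to the deterministic action
# and then clips to the action bounds. The same logic as a standalone helper, with
# the noise scale and clip range as parameters (a sketch mirroring the inline code,
# not part of the original):
import numpy as np

def noisy_action(action, noise_std, action_low, action_high, noise_clip=0.5):
    # clipped Gaussian noise, as in TD3-style exploration
    noise = np.clip(np.random.normal(0, noise_std, size=np.shape(action)),
                    -noise_clip, noise_clip)
    # keep the perturbed action inside the environment's action bounds
    return np.clip(action + noise, action_low, action_high)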
def main_loop(handle, possible_actions: list, model: Model, target_model: Model):
    exp_schedule = ExplorationScheduler()
    target_model.load_state_dict(model.state_dict())
    optimizer = torch.optim.RMSprop(model.parameters())

    with mss() as sct:
        counter = 0
        frame_counter = 0
        frame_skip_counter = 0
        score = 0
        lives = 3
        frame_times = [0, 0, 0, 0]
        replay_buffer = ReplayBuffer(
            REPLAY_BUFFER_SIZE, (3 * FRAMES_FEED, RESIZE_HEIGHT, RESIZE_WIDTH),
            FRAMES_FEED, baseline_priority=1, gamma=GAMMA, reward_steps=N_STEP_REWARD)
        t = 0
        action = 0

        while True:
            if not active:
                # Wait some time and check if recording should be resumed.
                time.sleep(0.5)
                continue

            startMillis = time.time()  # time

            # Grab frames
            frame, frame_cv2 = grab_screen(monitor, sct)

            # Show frame
            if DEBUG:
                cv2.imshow('window1', frame_cv2)

            # Check if this frame will be skipped; it is processed only when
            # the skip counter is 0
            if frame_skip_counter == 0:
                reward, score, lives = get_reward(handle, lives, score)
                if replay_buffer.waiting_for_effect:
                    replay_buffer.add_effects(action, reward)
                replay_buffer.push_frame(frame)

                # epsilon-greedy action selection once the buffer is initialized
                if replay_buffer.buffer_init() and np.random.random() > exp_schedule.value(t):
                    action = choose_action(replay_buffer.encode_last_frame(), model)
                else:
                    action = np.random.randint(0, len(possible_actions))
                execute_actions([possible_actions[int(action)]])  # dk.SCANCODES["z"]

                # Logic to deal with a ready datapoint
                if replay_buffer.can_sample(BATCH_SIZE) and t % TRAIN_FREQ == 0:
                    if PAUSE_ON_TRAIN:
                        pause_game()
                    for _ in range(BATCHES_PER_TRAIN):
                        optimize_model(model, target_model, replay_buffer, optimizer,
                                       num_actions=len(possible_actions))
                    if PAUSE_ON_TRAIN:
                        pause_game()

                # Copy model weights to the target model, saving them to disk on the way
                if t % TARGET_MODEL_UPDATE_FREQ == 0:
                    print("Saving model")
                    state_dict = model.state_dict()
                    torch.save(state_dict, MODEL_PATH)
                    print("done pickling")
                    target_model.load_state_dict(state_dict)
                    target_model.eval()

            frame_skip_counter += 1
            frame_skip_counter = frame_skip_counter % FRAMES_SKIP

            # Frame timings and other utility
            endMillis = time.time()
            frame_time = endMillis - startMillis
            frame_times[counter % 4] = frame_time
            t += 1
            # if counter % 4 == 0:
            #     print("frame time: %s" % (np.mean(frame_times)))
            counter += 1
            if cv2.waitKey(25) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                break
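# `ExplorationScheduler` above is constructed with no arguments and queried via
# `value(t)` as an epsilon threshold. A minimal linearly-decaying sketch consistent
# with that usage (the start/end/decay values here are illustrative defaults, not
# the original's):
class ExplorationScheduler:
    def __init__(self, start=1.0, end=0.05, decay_steps=100000):
        self.start, self.end, self.decay_steps = start, end, decay_steps

    def value(self, t):
        # linear interpolation from start to end, then held constant
        frac = min(float(t) / self.decay_steps, 1.0)
        return self.start + frac * (self.end - self.start)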
def train(sess, env, actor, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load a saved model if one exists
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())
    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()
    eval_acc_reward = 0
    tic = time.time()
    eps = 1

    while True:
        i += 1
        s = env.reset()
        ep_ave_max_q = 0
        eps *= EPS_DECAY_RATE
        eps = max(eps, EPS_MIN)
        episode_s, episode_acts, episode_rewards = [], [], []

        if i % SAVE_STEP == 0:  # save a checkpoint every SAVE_STEP episodes
            sess.run(global_step.assign(i))
            save_path = saver.save(sess, SUMMARY_DIR + "model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in xrange(MAX_EP_STEPS):
            # Added exploration noise
            action = actor.predict(np.reshape(s, np.hstack((1, actor.s_dim))))
            s2, r, terminal, info = env.step(action)

            episode_s.append(s)
            episode_acts.append(action)
            episode_rewards.append(r)
            s = s2
            eval_acc_reward += r

            if terminal:
                # stack together all inputs, actions, and rewards for this episode
                episode_rewards = np.asarray(episode_rewards)
                episode_rewards = discount_rewards(episode_rewards)

                # update buffer; note the next-state slot is filled with the same
                # state, since this policy-gradient update does not use it
                for n in range(len(episode_rewards)):
                    replay_buffer.add(np.reshape(episode_s[n], (actor.s_dim)),
                                      episode_acts[n], episode_rewards[n], terminal,
                                      np.reshape(episode_s[n], (actor.s_dim)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MINIBATCH_SIZE:
                    s_batch, a_batch, r_batch, t_batch, _ = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Update the actor policy using the sampled gradient
                    actor.train(s_batch, a_batch, r_batch)

                if i % EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: (eval_acc_reward + EVAL_EPISODES) / 2,
                    })
                    writer.add_summary(summary_str, i)
                    writer.flush()
                    print('| Success: %i %%' % ((eval_acc_reward + EVAL_EPISODES) / 2),
                          "| Episode", i,
                          ' | Time: %.2f' % time_gap,
                          ' | Eps: %.2f' % eps)
                    tic = time.time()
                    eval_acc_reward = 0
                break
def main():
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    # target_network_update_freq = 500
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # deicticShape = (3,3,1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        # convs=[(16,2,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    # model = models.mlp([8, 16])
    # model = models.mlp([16, 16])
    # model = models.mlp([16, 32])
    # model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        # CNN version
        return U.BatchInput(deicticShape, name=name)
        # MLP version
        # return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
        # return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                            q_func=q_func,
                            num_actions=num_actions,
                            num_cascade=num_cascade,
                            scope="deepq",
                            qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func")

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):
        # obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))
        # MLP version
        # qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action: add a small amount of noise to break ties randomly
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:
            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)
            # Get curr, next values: MLP version
            # qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            # qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)
            # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax
            # targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            # Copy into cascade without pruning
            # for i in range(num_cascade):
            #     qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size * num_deictic_patches),
                                              i, actionsTiled]
                qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled] = \
                    mask * targets + \
                    (1 - mask) * qCurrTargets[range(batch_size * num_deictic_patches),
                                              i + 1, actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(obses_t_deic, qCurrTargets)
            # MLP version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
            #     qCurrTargets)

        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
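# The cascade update above writes the Bellman target into level 0 and then, level by
# level, only lets the target through where it is *smaller* than the value at the
# previous (already-updated) level. The same rule applied to a single transition, as
# a small illustrative sketch of the pruning behavior:
import numpy as np

def cascade_targets(q_levels, target):
    # q_levels: per-level Q values for the taken action, shape (num_cascade,)
    out = q_levels.copy()
    out[0] = target
    for i in range(len(q_levels) - 1):
        # mirror the snippet: compare against level i as it stands after any update
        out[i + 1] = target if target < out[i] else out[i + 1]
    return out

# e.g. cascade_targets(np.array([0.5, 0.4, 0.3]), 0.35) -> [0.35, 0.4, 0.35]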
import gym

from networks import ActorNetwork, CriticNetwork
from replay_buffer import ReplayBuffer

MINIBATCH_SIZE = 64
GAMMA = 0.99

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high[0]

    actor = ActorNetwork(state_dim, action_dim, action_bound)
    critic = CriticNetwork(state_dim, action_dim)
    replay_buffer = ReplayBuffer(10000)

    total = 0
    for episode in range(1000):
        obs0 = env.reset()
        ep_reward = 0
        for t in range(max_steps):
            if episode % 25 == 0:
                env.render()
            action = actor.act(obs0)  # TODO add noise for exploration
            obs1, reward, done, info = env.step(action)
            replay_buffer.add(obs0.reshape(state_dim), action.reshape(action_dim),
                              reward, t, obs1.reshape(state_dim))
            if replay_buffer.size() > MINIBATCH_SIZE:
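# The snippet above is cut off right after the minibatch-size check. What typically
# follows in DDPG, and what the other snippets in this collection do, is sketched
# below against the predict_target/train/action_gradients interface those snippets
# use. This is a guess at the elided body under that assumed interface, not the
# original code for this ActorNetwork/CriticNetwork pair:
import numpy as np

def ddpg_update(actor, critic, replay_buffer, minibatch_size=64, gamma=0.99):
    s_batch, a_batch, r_batch, t_batch, s2_batch = \
        replay_buffer.sample_batch(minibatch_size)
    # critic target: r + gamma * Q'(s', mu'(s')) for non-terminal transitions
    target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
    y = [r_batch[k] if t_batch[k] else r_batch[k] + gamma * target_q[k]
         for k in range(minibatch_size)]
    critic.train(s_batch, a_batch, np.reshape(y, (minibatch_size, 1)))
    # actor update along the sampled deterministic policy gradient
    a_outs = actor.predict(s_batch)
    grads = critic.action_gradients(s_batch, a_outs)
    actor.train(s_batch, grads[0])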
from copy import deepcopy
from random import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

algo_name = 'DDQN'  # Used for visualization
max_episodes = 2000  # after 40 episodes, SAC flatlines around [-200, -100] reward
max_steps = 1000  # auto-terminate episode after win
gamma = 0.99  # Discount
α = 0.1  # Entropy temperature -- relative importance vs. rewards
lr = 3e-4  # Determines how big of a gradient step to take when optimizing
tau = 0.995  # Target smoothing coefficient --> how much of the old network(s) to keep
ε = 0.01  # Random exploration factor in training

env = gym.make('LunarLander-v2')
replay_buffer = ReplayBuffer(1e6)
batch_size = 128

q1 = Q(env)
q1_target = deepcopy(q1)
q1_optim = torch.optim.Adam(q1.parameters(), lr=lr)

q2 = Q(env)
q2_target = deepcopy(q2)
q2_optim = torch.optim.Adam(q2.parameters(), lr=lr)


def train():
    explore(10000)  # Explore the environment by taking random actions
    episode = 0
    while episode < max_episodes:  # roughly begin algorithm from SAC+ERE
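# `tau = 0.995` above is described as a target smoothing coefficient, but the update
# itself falls outside this excerpt. A standard polyak (soft) target update
# consistent with that reading, sketched in PyTorch (an assumption about how
# q1_target/q2_target are maintained, not code from the excerpt):
import torch

@torch.no_grad()
def soft_update(target_net, net, tau=0.995):
    # keep `tau` of the old target weights, mix in (1 - tau) of the online weights
    for tp, p in zip(target_net.parameters(), net.parameters()):
        tp.data.mul_(tau).add_((1.0 - tau) * p.data)

# e.g. after each gradient step: soft_update(q1_target, q1, tau); soft_update(q2_target, q2, tau)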
def train(
        args,
        log_dir,
        seed,
        env_id,
        replay_buffer_len,
        memory_len,
        cores,
        trees,
        p,             # #nn items; reported number is 50
        embed_size,    # embedding vector length; reported number is ?
        gamma,         # discount value; reported number is 0.99
        N,             # N-step bootstrapping; reported number is 100
        update_period,  # the reported number is 16//4 = 4
        batch_size,    # the reported number is 32
        init_eps,
        delta,
        lr,
        q_lr,
        epsilon,
        min_epsilon,
        epsilon_decay,  # exponential decay factor
        eval_period,
        save_period,
        **kwargs):
    # other hyperparameters
    _gw = np.array([gamma ** i for i in range(N)])

    # experiment setting
    Path(log_dir).mkdir(parents=True, exist_ok='temp' in log_dir)
    with open(os.path.join(log_dir, 'args.txt'), 'w') as f:
        f.write(str(args))

    np.random.seed(seed)
    tf.random.set_random_seed(seed)

    # Env
    env = wrap_deepmind(make_atari(env_id),
                        episode_life=False,
                        clip_rewards=False,
                        frame_stack=True,
                        scale=False)
    num_ac = env.action_space.n

    # ReplayBuffer
    replay_buffer = ReplayBuffer(replay_buffer_len)

    # Neural Episodic Controller
    nec = NEC(
        num_ac,
        p,
        embed_size,
        delta,
        lr,
        q_lr,
        dnd_params={
            'maxlen': memory_len,
            'seed': seed,
            'cores': cores,  # #cores for KD-Tree
            'trees': trees,  # #trees for KD-Tree
        })

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(os.path.join(log_dir, 'tensorboard'))

    def _write_scalar(it, it_type, tag, value):
        summary = tf.Summary(value=[
            tf.Summary.Value(tag=f"{tag}/{it_type}", simple_value=value)
        ])
        summary_writer.add_summary(summary, global_step=it)

    ####### Setup Done

    num_steps = 0
    num_updates = 0

    # Fill up the memory and replay buffer with a random policy
    for ep in range(init_eps):
        ob = env.reset()
        obs, acs, rewards = [ob], [], []
        for _ in itertools.count():
            ac = np.random.randint(num_ac)
            ob, r, done, _ = env.step(ac)
            obs.append(ob)
            acs.append(ac)
            rewards.append(r)
            num_steps += 1
            if done:
                break

        Rs = [
            np.sum(_gw[:len(rewards[i:i + N])] * rewards[i:i + N])
            for i in range(len(rewards))
        ]
        obs = np.array(obs)
        es = nec._embed(obs)
        for ob, e, a, R in zip(obs, es, acs, Rs):
            nec.append(e, a, R)
            replay_buffer.append(ob, a, R)

    # Training!
    next_save_steps = save_period
    try:
        for ep in itertools.count(start=init_eps):
            ob = env.reset()
            obs, acs, rewards, es, Vs = [ob], [], [], [], []
            for t in itertools.count():
                # Epsilon-greedy policy
                ac, (e, V) = nec.policy(ob)
                if np.random.random() < epsilon:
                    ac = np.random.randint(num_ac)

                ob, r, done, _ = env.step(ac)
                obs.append(ob)
                acs.append(ac)
                rewards.append(r)
                es.append(e)
                Vs.append(V)
                num_steps += 1

                # Train on a random minibatch from the replay buffer
                if num_steps % update_period == 0:
                    b_s, b_a, b_R = replay_buffer.sample(batch_size)
                    loss = nec.update(b_s, b_a, b_R)
                    num_updates += 1
                    if num_updates % 100 == 0:
                        print(f'[{num_steps*4}/{num_updates}] loss: {loss}')
                        _write_scalar(it=num_steps * 4, it_type='per_frames',
                                      tag='loss', value=loss)
                        _write_scalar(it=num_updates, it_type='per_updates',
                                      tag='loss', value=loss)
                        _write_scalar(it=num_steps * 4, it_type='per_frames',
                                      tag='num_updates', value=num_updates)

                if t >= N:
                    # N-step bootstrapping
                    # TODO: implement the efficient version
                    R = np.sum(_gw * rewards[t - N:t]) + (gamma ** N) * Vs[t]  # R_{t-N}

                    # append to memory
                    nec.append(es[t - N], acs[t - N], R)
                    # append to replay buffer
                    replay_buffer.append(obs[t - N], acs[t - N], R)

                if done:
                    break

            print(f'Episode {ep} -- Ep Len: {len(obs)} Acc Reward: {np.sum(rewards)} '
                  f'current epsilon: {epsilon}')
            _write_scalar(tag='ep', value=ep, it=num_steps * 4, it_type='per_frames')
            _write_scalar(tag='ep_len', value=len(obs), it=num_steps * 4, it_type='per_frames')
            _write_scalar(tag='ep_len', value=len(obs), it=ep, it_type='per_episode')
            _write_scalar(tag='eps_reward', value=np.sum(rewards), it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='eps_reward', value=np.sum(rewards), it=ep,
                          it_type='per_episode')
            _write_scalar(tag='epsilon', value=epsilon, it=ep, it_type='per_episode')

            # Remaining items that are not bootstrappable: the partial trajectory
            # close to the end of the episode. Append to memory & replay buffer.
            for t in range(len(rewards) - N, len(rewards)):
                R = np.sum([gamma ** (i - t) * rewards[i]
                            for i in range(t, len(rewards))])
                nec.append(es[t], acs[t], R)
                replay_buffer.append(obs[t], acs[t], R)

            # epsilon decay
            epsilon = max(min_epsilon, epsilon * epsilon_decay)

            # Save model & evaluate
            if ep % eval_period == 0:
                try:
                    ep_len, eps_reward = _run(env, nec,
                                              os.path.join(log_dir, f'test-{ep}.mp4'),
                                              maxlen=len(obs) * 3)
                    print(f'Evaluation -- Episode {ep} -- Ep Len: {ep_len} '
                          f'Acc Reward: {eps_reward}')
                    _write_scalar(tag='ep_len', value=ep_len, it=ep,
                                  it_type='per_episode_eval')
                    _write_scalar(tag='eps_reward', value=eps_reward, it=ep,
                                  it_type='per_episode_eval')
                except RuntimeError as e:
                    print(e)
                    print('Evaluation -- Skipped')

            if num_steps >= next_save_steps:
                nec.save(log_dir, it=next_save_steps * 4)  # iteration number -- num frames
                next_save_steps += save_period
    except KeyboardInterrupt:
        print('saving... please wait...')
        nec.save(log_dir)
        print('done!')
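# The N-step bootstrapped return used above combines N discounted rewards with a
# discounted value estimate: R_t = sum_{i=0}^{N-1} gamma^i r_{t+i} + gamma^N V_{t+N}.
# The same computation as a standalone function, mirroring `_gw` and the in-loop
# expression (illustrative, not extracted verbatim):
import numpy as np

def n_step_return(rewards, bootstrap_value, gamma, N):
    # rewards: the N rewards r_t .. r_{t+N-1}; bootstrap_value: V(s_{t+N})
    gw = np.array([gamma ** i for i in range(N)])
    return np.sum(gw * np.asarray(rewards[:N])) + (gamma ** N) * bootstrap_value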
def train(sess, env, actor, critic, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load a saved model if one exists
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("./results")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())
    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()
    eval_acc_reward = 0
    tic = time.time()
    eps = 1

    while True:
        i += 1
        s = env.reset()
        ep_ave_max_q = 0
        eps *= EPS_DECAY_RATE
        eps = max(eps, EPS_MIN)

        if i % SAVE_STEP == 0:  # save a checkpoint every SAVE_STEP episodes
            sess.run(global_step.assign(i))
            save_path = saver.save(sess, "./results/model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in xrange(MAX_EP_STEPS):
            # sample an action from a softmax over the actor's output scores
            a = actor.predict(np.reshape(s, np.hstack((1, actor.s_dim))))
            action_score = a[0]
            probs = np.exp(action_score - np.max(action_score))
            # probs = np.exp(action_score)
            probs /= np.sum(probs)
            # epsilon-greedy alternative (disabled):
            # dice = np.random.uniform()  # roll the dice!
            # if dice < epsilon:
            #     action = np.argmax(probs)
            # else:
            action = np.random.choice(4, 1, p=probs)

            s2, r, terminal, info = env.step(action)
            plt.imshow(s2, interpolation='none')
            plt.show()

            replay_buffer.add(np.reshape(s, (actor.s_dim)),
                              np.reshape(a, (actor.a_dim)), r,
                              terminal, np.reshape(s2, (actor.s_dim)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            eval_acc_reward += r

            if terminal:
                if i % EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: (eval_acc_reward + EVAL_EPISODES) / 2,
                        summary_vars[1]: ep_ave_max_q / float(j + 1),
                    })
                    writer.add_summary(summary_str, i)
                    writer.flush()
                    print('| Success: %i %%' % ((eval_acc_reward + EVAL_EPISODES) / 2),
                          "| Episode", i,
                          '| Qmax: %.4f' % (ep_ave_max_q / float(j + 1)),
                          ' | Time: %.2f' % time_gap,
                          ' | Eps: %.2f' % eps)
                    tic = time.time()
                    eval_acc_reward = 0
                break
def train(sess, env, network):
    arr_reward = np.zeros(MAX_EPISODES)
    arr_qmax = np.zeros(MAX_EPISODES)

    actor = Actor(sess, network, ACTOR_LEARNING_RATE)
    actor_target = ActorTarget(sess, network, TAU)
    critic = Critic(sess, network, CRITIC_LEARNING_RATE)
    critic_target = CriticTarget(sess, network, TAU)
    s_dim, a_dim, _ = network.get_const()

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    actor_target.train()
    critic_target.train()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            # Added exploration noise that decays with the episode index
            a = actor.predict(np.reshape(s, (1, s_dim))) + (1. / (1. + i))
            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (s_dim,)), np.reshape(a, (a_dim,)), r,
                              terminal, np.reshape(s2, (s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic_target.predict(s2_batch, actor_target.predict(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                # ep_ave_max_q += np.amax(predicted_q_value)
                ep_ave_max_q += np.mean(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor_target.train()
                critic_target.train()

            s = s2
            ep_reward += r

            if terminal:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('Reward: ' + str(ep_reward) + ', Episode: ' + str(i) +
                      ', Qmax: ' + str(ep_ave_max_q / float(j)))
                arr_reward[i] = ep_reward
                arr_qmax[i] = ep_ave_max_q / float(j)
                if i % 100 == 99:
                    np.savez(RESULTS_FILE, arr_reward[0:i], arr_qmax[0:i])
                break
def train_rollout(self, args, reward_result):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    # Get dynamics and initialize prior controller
    prior = BasePrior()

    # Initialize target network weights
    self.actor.update_target_network()
    self.critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    tflearn.is_training(True)

    paths = list()
    lambda_store = np.zeros((int(args['max_episode_len']), 1))

    for i in range(int(args['max_episodes'])):
        s = self.env.reset_inc()
        ep_reward = 0.
        ep_ave_max_q = 0
        obs, action, act_prior, rewards, obs_ref, prior_ref, collisions = \
            [], [], [], [], [], [], []

        # Get reward using the baseline controller
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a = self.env.getPrior()
            prior_ref.append(np.array([a]))
            s0, r, stop_c, act = self.env.step(a)
            ep_reward_opt += r
            obs_ref.append(s0)
            if stop_c:
                break

        # Get reward using the regRL algorithm
        s = self.env.reset()
        for j in range(int(args['max_episode_len'])):
            # Set control prior regularization weight
            lambda_mix = 15.
            lambda_store[j] = lambda_mix

            # Get control prior
            a_prior = self.env.getPrior()

            # RL control with exploration noise
            ab = self.actor.predict(np.reshape(s, (1, self.actor.s_dim))) + self.actor_noise()

            # Mix the actions (RL controller + control prior)
            act = ab[0] / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, act = self.env.step(act)
            collisions.append(self.env.collision_flag)
            act = np.array(act, ndmin=1)

            # Add info from the time step to the replay buffer
            replay_buffer.add(np.reshape(s, (self.actor.s_dim,)),
                              np.reshape(ab, (self.actor.a_dim,)), r, terminal,
                              np.reshape(s2, (self.actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                # Sample a batch from the replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = self.critic.predict_target(
                    s2_batch, self.actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = self.critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = self.actor.predict(s_batch)
                grads = self.critic.action_gradients(s_batch, a_outs)
                self.actor.train(s_batch, grads[0])

                # Update target networks
                self.actor.update_target_network()
                self.critic.update_target_network()

            s = s2
            ep_reward += r
            obs.append(s)
            rewards.append(r)
            action.append(act)
            act_prior.append(np.array([a_prior]))

            # Collect results at the end of the episode
            if terminal:
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward - ep_reward_opt), i, (ep_ave_max_q / float(j))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                reward_result[2, i] = np.mean(lambda_store)
                reward_result[3, i] = max(collisions)
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 6)),
                    "Observation_ref": np.concatenate(obs_ref).reshape((-1, 6)),
                    "Action": np.concatenate(action),
                    "Action_Prior": np.concatenate(act_prior),
                    "Action_Prior_Ref": np.concatenate(prior_ref),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)
                break

    return [summary_ops, summary_vars, paths, reward_result]
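# Several snippets in this collection call a `build_summaries()` helper that returns
# merged summary ops plus the variables fed at episode end (reward first, Qmax
# second). A typical TF1 implementation matching that calling convention (a sketch
# of the assumed helper, not its actual source):
import tensorflow as tf

def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)
    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars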
def main(args):
    file_name = 'td3_lunalander_v2'
    writer = SummaryWriter(log_dir="logs/{}_{}".format(file_name, 'numeric'))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    env = gym.make('LunarLanderContinuous-v2')
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    max_action = float(env.action_space.high[0])
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    replay_buffer = ReplayBuffer(int(1e6), [state_dim], action_dim)
    policy = TD3(env, state_dim, action_dim, max_action)

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            obs_tens = torch.from_numpy(state).float().reshape(1, -1).to(device)
            # action = np.clip(policy.select_action(obs_tens) + np.random.normal(0, max_action * args.expl_noise, size=3), -max_action, max_action)
            # action = np.clip(policy.select_action(obs_tens.to(device)) + np.random.normal(0, max_action * args.expl_noise), -max_action, max_action)
            action = policy.select_action(obs_tens.to(device))

        if episode_num > 1000:
            env.render()

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.store_transition(state, action, reward, next_state, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will
            # increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} "
                  f"Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            writer.add_scalar("reward", episode_reward, episode_num + 1)
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        if t % 100 == 0:
            policy.save(file_name)
env = wrappers.NormalizeActions(env)
env = wrappers.MinimumDuration(env, len_time)
env = wrappers.MaximumDuration(env, len_time)
env = wrappers.ObservationDict(env, key='observation')
env = wrappers.PixelObservations(env, image_res, np.uint8, 'image')
env = wrappers.ConvertRewardToCost(env)
env = wrappers.ConvertTo32Bit(env)

# SEED EXPERIMENT TO CREATE REPRODUCIBLE RESULTS
seed_value = 0
seed_experiment(seed=seed_value)
env.seed(seed=seed_value)

# GET ENVIRONMENT DATA SHAPES
observation_shape = env.observation_space['image'].shape
action_shape = env.action_space.shape
state_shape = env.state_space.shape

# INITIALIZE INFRASTRUCTURE
logger = Logger('.')
replay_buffer = ReplayBuffer(observation_shape, action_shape, state_shape,
                             max_num_episodes, len_time)
driver = Driver(env, replay_buffer=replay_buffer)

# GATHER EXPERIENCE
print('Generating dataset. Generate %d episodes of length %d.' %
      (max_num_episodes, len_time + 1))
driver.run(render=True, num_steps=max_num_episodes * len_time, logger=logger)

# SAVE DATASET
replay_buffer.save_buffer('.', name_dataset='dataset')
def train(sess, env, args, actor, critic, actor_noise):
    # Set up summary operations
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm. This hurts the performance on Pendulum
    # but could be useful in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):
        # Reset the environment, set the initial action to 0, and initialize
        # the action list for observability during analysis
        s = env.reset()
        actor_noise.reset()
        a = 0
        a_list = []

        # Evaluation period
        eval_time = 999

        # Episode reward and episode average max Q initializations
        ep_reward = 0
        ep_ave_max_q = 0

        # Initialize zero mean and st_dev. Will be corrected before use.
        mean = env.xs
        st_dev = [0.8, 15, 0.7]

        if i % 50 == 0 and i != 0:
            print("Evaluation Episode")

        for j in range(1, int(args['max_episode_len']) + 1):
            # Take an action every "sampling time" time steps to ensure steady
            # state is reached
            if j % int(args['sampling_time']) == 0:
                # Normalize the states by subtracting the mean and dividing by
                # the standard deviation
                s -= mean
                s /= st_dev

                # Adding Ornstein-Uhlenbeck exploration noise to the action
                if i % 50 == 0 and i != 0:
                    # Every 50th episode, the action has no noise, to evaluate
                    # performance.
                    a = actor.predict(np.reshape(s, (1, actor.s_dim)))
                    if i == (args['max_episodes'] - 1):
                        a_list.append(a)
                else:
                    noise = actor_noise()
                    a = actor.predict(np.reshape(s, (1, actor.s_dim))) + noise
                    if i == (args['max_episode_len'] - 1):
                        a_list.append(a - noise)

                # Take the action
                env.u[j, 0] = env.u[j - 1, 0] + a[0]

                # Define evaluation time for feedback
                eval_time = j + int(args['sampling_time']) - 1
            else:
                # If it is not the sampling time, keep the input constant
                env.u[j, 0] = env.u[j - 1, 0]

            # Simulate the next step
            env.x[j, :] = env.cstr_sim.sim(env.x[j - 1, :], env.u[j, :])

            # Determine if it's the end of the current episode. If the input is
            # very far from ideal, the episode ends.
            if j == env.Nsim or env.u[j, 0] < 150 or env.u[j, 0] > 450:
                terminal = True
            else:
                terminal = False

            # Feedback for RL
            if j == eval_time:
                # Ensure feedback is evaluated correctly
                assert (j + 1) % int(args['sampling_time']) == 0

                # Reward for RL
                r = env.reward_function(j, a[0][0])

                # Next state for RL
                s2 = deepcopy(env.x[j, :])

                # Add the latest state, action, reward, terminal flag, and new
                # state to the replay memory
                replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                                  np.reshape(a, (actor.a_dim,)), r, terminal,
                                  np.reshape(s2, (actor.s_dim,)))

                # Update the new state to be the current state
                s = s2

                # Add the step's reward to the whole episode's reward
                ep_reward += r

            # Keep adding experience to the memory until there are at least
            # minibatch-size samples. Batch training area.
            if replay_buffer.size() > int(args['minibatch_size'] * 5):
                # mini-batch size grows with the cube root of the episode index
                mini_batch_size = np.power(i, 1 / 3) * int(args['minibatch_size'])

                # Obtain a batch of data from the replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(mini_batch_size))

                # Calculate the critic target Q-value, feeding in the actor
                # target action. The states are the s2 from the replay buffer.
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                # Calculate the Q values
                y_i = []
                for k in range(int(mini_batch_size)):
                    # Terminal state: Q = r because there is no additional
                    # trajectory beyond this point
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # If the state is not terminal: Q = r + gamma * argmax-a * Q(s', a)
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets.
                # Exact algorithm: critic.train() returns (predicted_q_value,
                # optimize); optimize takes the MSE of y_i and the predicted Q
                # value, then does an Adam gradient-descent update of the critic
                # network.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(mini_batch_size), 1)))

                # Output is a batch of predicted Q values; track their max.
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient:
                # the scaled output actions given the s_batch states,
                a_outs = actor.predict(s_batch)
                # the symbolic gradients of the critic as a function of those actions,
                grads = critic.action_gradients(s_batch, a_outs)
                # and an actor update along those gradients.
                actor.train(s_batch, grads[0])

                # Update target networks by tau
                actor.update_target_network()
                critic.update_target_network()

            if terminal:
                # Update the summary ops
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))
                break

    return replay_buffer, a_list
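# `actor_noise` above is callable and resettable, and the comments describe it as
# Ornstein-Uhlenbeck exploration noise. A standard OU process matching that
# interface (a sketch of the assumed class with illustrative default parameters,
# not the original definition):
import numpy as np

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2):
        # mu: numpy array giving the long-run mean of the process
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1)
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x

    def reset(self):
        self.x = np.zeros_like(self.mu)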
def main():
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    # env = gym.make("CartPole-v0")
    # env = gym.make("CartPole-v1")
    # env = gym.make("Acrobot-v1")
    env = gym.make("MountainCar-v0")

    # model = models.mlp([32])
    model = models.mlp([64])
    # model = models.mlp([16, 16])

    # parameters
    q_func = model
    lr = 1e-3
    max_timesteps = 100000
    # max_timesteps = 10000
    buffer_size = 50000
    exploration_fraction = 0.1
    # exploration_fraction = 0.3
    exploration_final_eps = 0.02
    train_freq = 1
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = 1.0
    target_network_update_freq = 500
    # prioritized_replay = False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    model_saved = False

    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled
            # from the replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))

    plt.plot(episode_rewards)
    plt.show()
def main():
    # env = gym.make("CartPoleRob-v0")
    # env = gym.make("CartPole-v0")
    # env = gym.make("CartPole-v1")
    # env = gym.make("Acrobot-v1")
    # env = gym.make("MountainCarRob-v0")
    # env = gym.make("FrozenLake-v0")
    # env = gym.make("FrozenLake8x8-v0")
    # env = gym.make("FrozenLake8x8rob-v0")
    # env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")

    # robShape = (2,)
    # robShape = (3,)
    # robShape = (200,)
    # robShape = (16,)
    # robShape = (64,)
    robShape = (8, 8, 1)
    # robShape = (16,16,1)

    def make_obs_ph(name):
        # return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(robShape, name=name)

    # # these params are specific to mountaincar
    # def getOneHotObs(obs):
    #     obsFraction = (obs[0] + 1.2) / 1.8
    #     idx1 = np.int32(np.trunc(obsFraction*100))
    #     obsFraction = (obs[1] + 0.07) / 0.14
    #     idx2 = np.int32(np.trunc(obsFraction*100))
    #     ident = np.identity(100)
    #     return np.r_[ident[idx1,:],ident[idx2,:]]

    # these params are specific to frozenlake
    def getOneHotObs(obs):
        # ident = np.identity(16)
        ident = np.identity(64)
        # ident = np.identity(256)
        # return ident[obs,:]
        return np.reshape(ident[obs, :], [8, 8, 1])
        # return np.reshape(ident[obs,:],[16,16,1])

    # model = models.mlp([32])
    # model = models.mlp([64])
    # model = models.mlp([64], layer_norm=True)
    # model = models.mlp([16, 16])

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # used in pong
        # hiddens=[256],  # used in pong
        convs=[(8, 4, 1)],
        hiddens=[16],
        dueling=True)

    # parameters
    q_func = model
    lr = 1e-3
    # max_timesteps = 100000
    # max_timesteps = 50000
    max_timesteps = 20000
    buffer_size = 50000
    # exploration_fraction = 0.1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    # exploration_final_eps = 0.1
    train_freq = 1
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = 1.
    target_network_update_freq = 500
    prioritized_replay = False
    # prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        double_q=False)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    # obs = np.reshape(obs,[8,8,1])
    # obs = getOneHotObs(obs)
    model_saved = False

    for t in range(max_timesteps):
        # env.render()

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        # new_obs = getOneHotObs(new_obs)

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            # obs = getOneHotObs(obs)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled
            # from the replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))

    # plot a moving average of the episode rewards
    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    # plt.plot(episode_rewards)
    plt.show()
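# Both baselines-style scripts above rely on `LinearSchedule(schedule_timesteps,
# final_p, initial_p)` whose `value(t)` interpolates linearly from initial_p to
# final_p and then stays at final_p. A minimal version matching the OpenAI baselines
# schedule these scripts import:
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # fraction of the schedule completed, capped at 1
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)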
def train_agent(args):
    """Train the agent.

    Args:
        args: parsed run configuration (hyperparameters, paths, device).
    """
    # create CNN, convert the [1,3,84,84] input to [1, 200]
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '_agent_' + str(args.policy)
    pathname += "_batch_size_" + str(args.batch_size) + "_lr_act_" + str(args.lr_actor)
    pathname += "_lr_critic_" + str(args.lr_critic) + "_lr_decoder_"
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    size = args.size
    env = gym.make(args.env_name, renderer='egl')
    state = env.reset()
    print("state ", state.shape)
    state_dim = 200
    print("State dim, ", state_dim)
    action_dim = 5
    print("action_dim ", action_dim)
    max_action = 1
    args.target_entropy = -np.prod(action_dim)
    args.max_episode_steps = 200
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (args.history_length, size, size)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    policy = TQC(state_dim, action_dim, max_action, args)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.image_pad,
                                 args.device)
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    # TODO: evaluate
    # evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
    # save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(episode_num, evaluations[-1], args.policy)
    # policy.save(save_model)
    done_counter = deque(maxlen=100)

    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            # env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if total_timesteps > args.start_timesteps and episode_num % args.update_beta_freq == 0:
                policy.update_beta(replay_buffer, writer, total_timesteps)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean, total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                if episode_timesteps < 50:
                    done_counter.append(1)
                else:
                    done_counter.append(0)
                goals = sum(done_counter)
                text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Goal last 100 ep : {} ".format(goals)
                text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))
                writer.add_scalar('Goal_freq', goals, total_timesteps)
                print(text)
                write_into_file(pathname, text)
                # policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the episode is done, we reset the state of the environment
            state = env.reset()
            obs, state_buffer = stacked_frames(state, size, args, policy)
            # Set done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0

        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            # After args.start_timesteps timesteps, we switch to the model
            action = policy.select_action(obs)

        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        # print(reward)
        # frame = cv2.imshow("wi", np.array(new_obs))
        # cv2.waitKey(10)
        done = float(done)
        new_obs, state_buffer = create_next_obs(new_obs, size, args, state_buffer, policy)

        # We check if the episode is done
        # done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(done)
        if episode_timesteps + 1 == args.max_episode_steps:
            done = True

        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward

        # We store the new transition into the experience replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)

        # We update the state, the episode timestep, the total timesteps, and the timesteps since the last evaluation
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            for i in range(args.repeat_update):
                policy.train(replay_buffer, writer, 1)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, episode_num))
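# train_agent() relies on two helpers, stacked_frames() and create_next_obs(),
# that are defined elsewhere. As a hedged sketch of the usual technique (names
# and shapes here are assumptions, not this project's actual implementation),
# frame stacking keeps a rolling window of the last `history_length` frames and
# stacks them along the channel axis:

from collections import deque

import numpy as np


def stack_frames_sketch(frame, history_length, frame_buffer=None):
    """Maintain a rolling stack of the most recent frames.

    On the first call the buffer is filled with copies of the initial frame;
    returns (stacked_obs, frame_buffer).
    """
    if frame_buffer is None:
        frame_buffer = deque([frame] * history_length, maxlen=history_length)
    else:
        frame_buffer.append(frame)
    stacked_obs = np.stack(frame_buffer, axis=0)  # (history_length, H, W)
    return stacked_obs, frame_buffer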
TARGET_UPDATE_EPISODE = 10
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = 10000

# we need two DQN networks
policy = DQN(POLICY_ARGS).to(DEVICE)
# print(policy)
target = DQN(POLICY_ARGS).to(DEVICE)
policy_weight = policy.state_dict()
target.load_state_dict(policy_weight)
target.eval()  # freeze the target net; we don't want to train it

mse = nn.MSELoss()
optimizer = optim.RMSprop(policy.parameters())
replay_buffer = ReplayBuffer(BUFFER_SIZE)

# training phase
total_game_step = 0
for current_episode in range(EPISODE):
    state = env.reset()  # get the initial observation
    game_step = 0
    total_reward = 0
    state = torch.tensor([state]).float().to(DEVICE)
    while True:
        game_step += 1
        total_game_step += 1
        action = policy.act(state, total_game_step, isTrain=True).to(DEVICE)  # sample an action
        next_state, reward, done, _ = env.step(action.item())  # take the action in the environment
        total_reward += reward
        reward = torch.tensor([reward]).float().to(DEVICE)
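# The EPS_START/EPS_END/EPS_DECAY constants above suggest that policy.act()
# anneals epsilon internally. A common choice (a sketch of one conventional
# rule, not necessarily this code's exact one) is exponential decay toward
# EPS_END:

import math
import random


def epsilon_by_step(step, eps_start=0.9, eps_end=0.1, eps_decay=10000):
    """Exponentially decayed exploration rate."""
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)


def should_explore(step):
    return random.random() < epsilon_by_step(step)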
    s = env.reset()
    reward = 0
    for _ in range(t_max):
        qvalues = agent.get_qvalues([s])
        action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]
        s, r, done, _ = env.step(action)
        reward += r
        if done:
            break
    rewards.append(reward)
    return np.mean(rewards)


evaluate(env, agent, n_games=1)

from replay_buffer import ReplayBuffer

exp_replay = ReplayBuffer(10)
for _ in range(30):
    exp_replay.add(env.reset(), env.action_space.sample(), 1.0, env.reset(), done=False)

obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(5)

assert len(exp_replay) == 10, "experience replay size should be 10 because that's what maximum capacity is"


def play_and_record(agent, env, exp_replay, n_steps=1):
    """
    Play the game for exactly n_steps, recording every (s, a, r, s', done) to the replay buffer.
    Whenever the game ends, add a record with done=True and reset the game.
    :returns: sum of rewards over time

    Note: please do not env.reset() unless env is done.
    """
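# The docstring above fully specifies the behavior; a minimal implementation
# sketch consistent with it, assuming the same agent/env/replay interfaces
# used earlier in this snippet (the _last_obs caching is a hypothetical way to
# carry the current state between calls without resetting):

def play_and_record_sketch(agent, env, exp_replay, n_steps=1):
    s = getattr(play_and_record_sketch, "_last_obs", None)
    if s is None:
        s = env.reset()
    sum_rewards = 0.0
    for _ in range(n_steps):
        qvalues = agent.get_qvalues([s])
        a = agent.sample_actions(qvalues)[0]
        s_next, r, done, _ = env.step(a)
        exp_replay.add(s, a, r, s_next, done)
        sum_rewards += r
        # Only reset when the episode actually ends.
        s = env.reset() if done else s_next
    play_and_record_sketch._last_obs = s
    return sum_rewards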
def train(sess, env, args, actor, critic, actor_noise):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            # Add exploration noise
            # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            # print("actor.s_dim: ", actor.s_dim)
            # print("s: ", s)
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r
            # print("s: ", s)
            # print("ep_reward: ", ep_reward)

            if terminal:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))
                break
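# actor_noise is passed in as a callable. DDPG implementations in this style
# commonly use temporally correlated Ornstein-Uhlenbeck noise; a minimal
# sketch follows (parameter values are conventional defaults, not taken from
# this code):

import numpy as np


class OrnsteinUhlenbeckNoiseSketch:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x = np.copy(self.mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = self.x + self.theta * (self.mu - self.x) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        return self.x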
def train(sess, env, actor, critic):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):
            # Add exploration noise
            a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i + j))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()
                print('| Reward: %.2i' % int(ep_reward), ' | Episode', i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                break
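# Both DDPG variants above call update_target_network() on the actor and the
# critic. In the standard algorithm this is a "soft" Polyak update; here is a
# sketch of the idea on plain lists of weight arrays (the real code does this
# with TensorFlow assign ops):

def soft_update_sketch(online_weights, target_weights, tau=0.001):
    """Blend target weights toward online weights: theta' <- tau*theta + (1-tau)*theta'."""
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]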
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    tf.gfile.MakeDirs(FLAGS.log_dir)
    summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir,
                                                        flush_millis=10000)

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
        rand_actions = int(1e4)
    else:
        rand_actions = int(1e3)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    if FLAGS.algo == 'td3':
        model = ddpg_td3.DDPG(obs_shape[0], act_shape[0], use_td3=True,
                              policy_update_freq=2, actor_lr=1e-3)
    else:
        model = ddpg_td3.DDPG(obs_shape[0], act_shape[0], use_td3=False,
                              policy_update_freq=1, actor_lr=1e-4)

    replay_buffer_var = contrib_eager_python_tfe.Variable('', name='replay_buffer')
    gym_random_state_var = contrib_eager_python_tfe.Variable('', name='gym_random_state')
    np_random_state_var = contrib_eager_python_tfe.Variable('', name='np_random_state')
    py_random_state_var = contrib_eager_python_tfe.Variable('', name='py_random_state')

    saver = contrib_eager_python_tfe.Saver(
        model.variables + [replay_buffer_var] +
        [gym_random_state_var, np_random_state_var, py_random_state_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
    eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables + [reward_scale])
    tf.gfile.MakeDirs(FLAGS.eval_save_dir)

    last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
    if last_checkpoint is None:
        replay_buffer = ReplayBuffer()
        total_numsteps = 0
        prev_save_timestep = 0
        prev_eval_save_timestep = 0
    else:
        saver.restore(last_checkpoint)
        replay_buffer = pickle.loads(zlib.decompress(replay_buffer_var.numpy()))
        total_numsteps = int(last_checkpoint.split('-')[-1])
        assert len(replay_buffer) == total_numsteps
        prev_save_timestep = total_numsteps
        prev_eval_save_timestep = total_numsteps
        env.unwrapped.np_random.set_state(pickle.loads(gym_random_state_var.numpy()))
        np.random.set_state(pickle.loads(np_random_state_var.numpy()))
        random.setstate(pickle.loads(py_random_state_var.numpy()))

    with summary_writer.as_default():
        while total_numsteps < FLAGS.training_steps:
            rollout_reward, rollout_timesteps = do_rollout(
                env, model.actor, replay_buffer,
                noise_scale=FLAGS.exploration_noise, rand_actions=rand_actions)
            total_numsteps += rollout_timesteps

            logging.info('Training: total timesteps %d, episode reward %f',
                         total_numsteps, rollout_reward)
            print('Training: total timesteps {}, episode reward {}'.format(
                total_numsteps, rollout_reward))

            with contrib_summary.always_record_summaries():
                contrib_summary.scalar('reward', rollout_reward, step=total_numsteps)
                contrib_summary.scalar('length', rollout_timesteps, step=total_numsteps)

            if len(replay_buffer) >= FLAGS.min_samples_to_start:
                for _ in range(rollout_timesteps):
                    time_step = replay_buffer.sample(batch_size=FLAGS.batch_size)
                    batch = TimeStep(*zip(*time_step))
                    model.update(batch)

            if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
                replay_buffer_var.assign(zlib.compress(pickle.dumps(replay_buffer)))
                gym_random_state_var.assign(pickle.dumps(env.unwrapped.np_random.get_state()))
                np_random_state_var.assign(pickle.dumps(np.random.get_state()))
                py_random_state_var.assign(pickle.dumps(random.getstate()))
                saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'),
                           global_step=total_numsteps)
                prev_save_timestep = total_numsteps

            if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
                eval_saver.save(os.path.join(FLAGS.eval_save_dir, 'checkpoint'),
                                global_step=total_numsteps)
                prev_eval_save_timestep = total_numsteps
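# `TimeStep(*zip(*time_step))` above transposes a list of per-transition
# tuples into a tuple of per-field batches. A self-contained illustration of
# the idiom (the field names here are assumptions for the sketch):

import collections

TimeStepSketch = collections.namedtuple(
    'TimeStepSketch', ['obs', 'action', 'reward', 'next_obs', 'done'])

transitions = [
    TimeStepSketch(1, 0, 0.0, 2, False),
    TimeStepSketch(2, 1, 1.0, 3, True),
]
batch = TimeStepSketch(*zip(*transitions))
assert batch.reward == (0.0, 1.0)  # one tuple per field, batched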
def calc_po_best_response_PER(poacher, target_poacher, po_copy_op, po_good_copy_op,
                              patrollers, pa_s, pa_type, iteration, sess, env, args,
                              final_utility, starting_e, train_episode_num=None):
    '''
    Given a list of patrollers and their types (DQN, PARAM, RS),
    train a DQN poacher as the approximate best response.
    Args:
        poacher: DQN poacher
        target_poacher: target DQN poacher
        po_copy_op: tensorflow copy operations, copy the weights from the DQN to the target DQN
        po_good_copy_op: tensorflow copy operations, save the best poacher DQN trained so far
        patrollers: a list of patrollers
        pa_s: the patroller mixed strategy over the list of patrollers
        pa_type: a list specifying the type of each patroller, {'DQN', 'PARAM', 'RS'}
        iteration: the current DO iteration
        sess: tensorflow sess
        env: the game environment
        args: some args
        final_utility: records the best response utility
        starting_e: the starting training epoch
    Return:
        Nothing is explicitly returned, due to multithreading.
        The best response utility is returned in $final_utility$.
        The best response DQN is copied through the $po_good_copy_op$.
    '''
    # print('FIND_poacher_best_response iteration: ' + str(iteration))

    if train_episode_num is None:
        train_episode_num = args.po_episode_num

    decrease_time = 1.0 / args.epsilon_decrease
    epsilon_decrease_every = train_episode_num // decrease_time

    if not args.PER:
        replay_buffer = ReplayBuffer(args, args.po_replay_buffer_size)
    else:
        replay_buffer = PERMemory(args)

    pa_strategy = pa_s
    best_utility = -10000.0
    test_utility = []

    if starting_e == 0:
        log = open(args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat', 'w')
        test_log = open(args.save_path + 'po_log_test_iter_' + str(iteration) + '.dat', 'w')
    else:
        log = open(args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat', 'a')
        test_log = open(args.save_path + 'po_log_test_iter_' + str(iteration) + '.dat', 'a')

    epsilon = 1.0
    learning_rate = args.po_initial_lr
    global_step = 0

    action_id = {
        ('still', 1): 0,
        ('up', 0): 1,
        ('down', 0): 2,
        ('left', 0): 3,
        ('right', 0): 4
    }

    sess.run(po_copy_op)

    for e in range(starting_e, starting_e + train_episode_num):
        if e > 0 and e % epsilon_decrease_every == 0:
            epsilon = max(0.1, epsilon - args.epsilon_decrease)

        if e % args.mix_every_episode == 0 or e == starting_e:
            pa_chosen_strat = np.argmax(np.random.multinomial(1, pa_strategy))
            patroller = patrollers[pa_chosen_strat]
            type = pa_type[pa_chosen_strat]

        # if args.gui == 1 and e > 0 and e % args.gui_every_episode == 0:
        #     test_gui(poacher, patroller, sess, args, pah=heuristic_flag, poh=False)

        ### reset the environment
        poacher.reset_snare_num()
        pa_state, po_state = env.reset_game()
        episode_reward = 0.0
        pa_action = 'still'

        for t in range(args.max_time):
            global_step += 1
            transition = []

            ### transition adds the current state
            transition.append(po_state)

            ### poacher chooses an action, if it has not been caught/returned home
            if not env.catch_flag and not env.home_flag:
                po_state = np.array([po_state])
                snare_flag, po_action = poacher.infer_action(
                    sess=sess, states=po_state, policy="epsilon_greedy",
                    epsilon=epsilon, po_loc=env.po_loc,
                    animal_density=env.animal_density)
            else:
                snare_flag = True
                po_action = 'still'

            transition.append(action_id[(po_action, snare_flag)])

            ### patroller chooses an action
            ### Note that heuristic and DQN agents have different APIs
            if type == 'DQN':
                pa_state = np.array([pa_state])  # Make it 2-D, i.e., [batch_size(1), state_size]
                pa_action = patroller.infer_action(
                    sess=sess, states=pa_state, policy="greedy",
                    pa_loc=env.pa_loc, animal_density=env.animal_density)
            elif type == 'PARAM':
                pa_loc = env.pa_loc
                pa_action = patroller.infer_action(
                    pa_loc, env.get_local_po_trace(pa_loc), 1.5, -2.0, 8.0)
            elif type == 'RS':
                pa_loc = env.pa_loc
                footprints = []
                actions = ['up', 'down', 'left', 'right']
                for i in range(4, 8):
                    if env.po_trace[pa_loc[0], pa_loc[1]][i] == 1:
                        footprints.append(actions[i - 4])
                pa_action = patroller.infer_action(pa_loc, pa_action, footprints)

            pa_state, _, po_state, po_reward, end_game = \
                env.step(pa_action, po_action, snare_flag)

            ### transition adds the reward, and the new state
            transition.append(po_reward)
            transition.append(po_state)
            episode_reward += po_reward

            ### Add transition to replay buffer
            replay_buffer.add_transition(transition)

            ### Start training
            ### Sample a minibatch
            if replay_buffer.size >= args.batch_size:
                if not args.PER:
                    train_state, train_action, train_reward, train_new_state = \
                        replay_buffer.sample_batch(args.batch_size)
                else:
                    train_state, train_action, train_reward, train_new_state, \
                        idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size)

                ### Double DQN: get target
                max_index = poacher.get_max_q_index(sess=sess, states=train_new_state)
                max_q = target_poacher.get_q_by_index(sess=sess, states=train_new_state,
                                                      index=max_index)
                q_target = train_reward + args.reward_gamma * max_q

                if args.PER:
                    q_pred = sess.run(poacher.output, {poacher.input_state: train_state})
                    q_pred = q_pred[np.arange(args.batch_size), train_action]
                    TD_error_batch = np.abs(q_target - q_pred)
                    replay_buffer.update(idx_batch, TD_error_batch)

                if not args.PER:
                    weight = np.ones(args.batch_size)
                else:
                    weight = weight_batch

                ### Update parameters
                feed = {
                    poacher.input_state: train_state,
                    poacher.actions: train_action,
                    poacher.q_target: q_target,
                    poacher.learning_rate: learning_rate,
                    poacher.loss_weight: weight
                }
                sess.run(poacher.train_op, feed_dict=feed)

                ### Update target network
                if global_step > 0 and global_step % args.target_update_every == 0:
                    sess.run(po_copy_op)

            ### game ends: 1) the patroller catches the poacher and removes all the snares;
            ###            2) the maximum time step is reached
            if end_game or (t == args.max_time - 1):
                info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \
                    (e, t + 1, episode_reward, 1. * episode_reward / (t + 1))
                if e % args.print_every == 0:
                    log.write(info + '\n')
                    print('po ' + info)
                    # log.flush()
                break

        ### save model
        if e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1:
            save_name = args.save_path + 'iteration_' + str(iteration) + '_epoch_' + str(e) + "_po_model.ckpt"
            poacher.save(sess=sess, filename=save_name)
            # print('Save model to ' + save_name)

        ### test
        if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode == 0):
            po_utility = 0.0
            test_total_reward = np.zeros(len(pa_strategy))

            ### test against each patroller strategy in the current strategy set
            for pa_strat in range(len(pa_strategy)):
                if pa_strategy[pa_strat] > 1e-10:
                    _, test_total_reward[pa_strat], _ = test_(
                        patrollers[pa_strat], poacher, env, sess, args, iteration, e,
                        poacher_type='DQN', patroller_type=pa_type[pa_strat])
                    po_utility += pa_strategy[pa_strat] * test_total_reward[pa_strat]

            test_utility.append(po_utility)

            if po_utility > best_utility and (e > min(50000, train_episode_num / 2) or args.row_num == 3):
                best_utility = po_utility
                sess.run(po_good_copy_op)
                final_utility[1] = po_utility

            info = [str(po_utility)] + [str(x) for x in test_total_reward]
            info = 'test ' + str(e) + ' ' + '\t'.join(info) + '\n'
            # print('reward is: ', info)
            print('po ' + info)
            test_log.write(info)
            test_log.flush()

    test_log.close()
    log.close()
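# Sampling an opponent from the mixed strategy above uses
# np.argmax(np.random.multinomial(1, pa_strategy)): one multinomial draw is a
# one-hot vector, and argmax recovers the sampled index. An equivalent,
# arguably clearer form (a sketch, not this code's own helper):

import numpy as np


def sample_pure_strategy(mixed_strategy, rng=np.random):
    """Draw an index i with probability mixed_strategy[i]."""
    return rng.choice(len(mixed_strategy), p=mixed_strategy)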
def cbf(
        rank,
        env,
        sess,
        env_name,
        seed,
        debug,
        tensorboard,
        idf,
        replay_size,           # size of replay buffer
        batch_size,            # size of minibatch
        n_timesteps,           # number of timesteps
        len_rollouts,          # length of each rollout
        n_optimizations,       # number of optimization steps
        embedding_space_size,  # size of embeddings
        learning_rate,         # learning rate of forward dynamics
        joint_training=False,
        using_extrinsic_reward=False):

    # Initialize models
    emb = CnnEmbedding("embedding", env.observation_space, env.action_space,
                       embedding_space_size)
    fd = ForwardDynamics("forward_dynamics", embedding_space_size,
                         env.action_space) if not using_extrinsic_reward else None
    idf = InverseDynamics("inverse_dynamics", env.observation_space,
                          env.action_space, embedding_space_size, emb) if idf else None
    policy = Policy("policy_new", env.action_space, joint_training,
                    emb_size=embedding_space_size, emb_network=emb)
    ppo = PPO(
        env,
        policy,
        emb_network=emb,
        emb_size=embedding_space_size,
        max_timesteps=int(n_timesteps),
        clip_param=0.2,
        entcoeff=0.001,
        optim_epochs=8,
        optim_stepsize=1e-3,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        joint_training=joint_training,
    )

    if tensorboard:
        merged_summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter('tmp/tensorflow/', sess.graph)

    if not debug and rank == 0:
        cur_time = datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
        directory = 'results/' + cur_time
        if not os.path.exists(directory):
            os.makedirs(directory)
        directory_m = 'model/' + cur_time
        if not os.path.exists(directory_m):
            os.makedirs(directory_m)
        txt = 'Running with env:%s, seed:%s, num timesteps:%s, joint-training:%s, using-extrinsic-reward:%s\n\n' \
            % (env_name, seed, n_timesteps, joint_training, using_extrinsic_reward)
        txt += 'Hyperparameters:\n - replay size:%s\n - batch size:%s\n - length of rollout:%s\n - number of optimization steps:%s\n - ' \
            % (replay_size, batch_size, len_rollouts, n_optimizations)
        txt += 'size of embedding:%s\n - learning rate of forward dynamics:%s\n\n' \
            % (embedding_space_size, learning_rate)
        txt += 'For inference on model, run:\n'
        txt += 'python3 cbf.py --env %s --seed %s --joint-training %s --using-extrinsic-reward %s ' \
            % (env_name, seed, joint_training, using_extrinsic_reward)
        txt += '--inference True --path-to-model %s' % (directory_m + '/model.ckpt')
        with open(directory + '/info.txt', 'w+') as txt_file:
            txt_file.write(txt)

    replay_memory = ReplayBuffer(replay_size)
    sess = tf.get_default_session()
    # sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    t = 0
    i = 0

    # initialize optimization batch variables
    a = env.action_space.sample()  # not used, just so we have the datatype
    done = True  # marks if we're on the first timestep of an episode
    s = env.reset()
    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode
    ep_rets = []  # returns of completed episodes in this segment
    ep_lens = []  # lengths of ...

    # Initialize history arrays
    if joint_training:
        s_arr = np.array([np.zeros([84, 84, 4]) for _ in range(len_rollouts)])
    else:
        s_arr = np.array([np.zeros(embedding_space_size) for _ in range(len_rollouts)])
    r_arr = np.zeros(len_rollouts, 'float32')
    vpreds = np.zeros(len_rollouts, 'float32')
    dones = np.zeros(len_rollouts, 'int32')
    a_arr = np.array([a for _ in range(len_rollouts)])

    # For graphing
    best_reward = -float("inf")
    cur_reward = 0
    graph_rewards = []
    graph_best_rewards = []
    graph_epi_lens = []
    graph_in_rewards = []
    graph_avg_rewards = []
    graph_avg_epi_lens = []
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths

    while True:
        for j in range(len_rollouts):
            if not debug and rank == 0 and t > 0 and t % int(1e3) == 0:
                # print('# frame: %i. Best reward so far: %i.' % (t, best_reward,))
                save_to_file(directory, env_name, graph_rewards,
                             graph_best_rewards, graph_epi_lens,
                             graph_in_rewards, graph_avg_rewards,
                             graph_avg_epi_lens)
                save_path = saver.save(sess, directory_m + '/model.ckpt')
                save_path = saver.save(sess, 'model/model.ckpt')
                # print("Model saved in file: %s" % save_path)
            if tensorboard and rank == 0 and t > 0 and t % int(1e3) == 0:
                summary = sess.run(merged_summary_op)
                writer.add_summary(summary, i)

            s = np.array(s)
            obs1 = emb.embed([s])
            if joint_training:
                a, vpred = policy.act([s])
            else:
                a, vpred = policy.act(obs1)

            # update optimization batch variables
            idx = t % len_rollouts
            s_arr[idx] = s if joint_training else obs1
            vpreds[idx] = vpred
            dones[idx] = done
            a_arr[idx] = a

            s_, ext_r, done, _ = env.step(a)
            cur_reward += ext_r
            s_ = np.array(s_)

            # compute intrinsic reward
            obs2 = emb.embed([s_])
            r = fd.get_loss(obs1, obs2, np.eye(env.action_space.n)[a]) \
                if not using_extrinsic_reward else ext_r

            replay_memory.add(s, a, r, s_, done)
            if t > 0 and t % int(2e2) == 0:
                graph_in_rewards.append((r, i))

            # update optimization batch variables
            r_arr[idx] = r
            cur_ep_ret += r
            cur_ep_len += 1

            # Prepare for next step
            if done:
                rewbuffer.append(cur_reward)
                lenbuffer.append(cur_ep_len)
                graph_rewards.append((cur_reward, i))
                graph_epi_lens.append((cur_ep_len, i))
                ep_rets.append(cur_reward)
                ep_lens.append(cur_ep_len)
                cur_ep_ret = 0
                cur_ep_len = 0
                if cur_reward > best_reward:
                    best_reward = cur_reward
                graph_best_rewards.append((best_reward, i))
                graph_avg_rewards.append((sum(rewbuffer) / len(rewbuffer), i))
                graph_avg_epi_lens.append((sum(lenbuffer) / len(lenbuffer), i))
                cur_reward = 0
                s = env.reset()
            else:
                s = s_
            t += 1
            i += 1

        ppo.prepare({
            "ob": s_arr,
            "rew": r_arr,
            "vpred": vpreds,
            "new": dones,
            "ac": a_arr,
            "nextvpred": vpred * (1 - done),
            "ep_rets": ep_rets,
            "ep_lens": ep_lens
        })
        ep_rets = []
        ep_lens = []

        for j in range(n_optimizations):
            # optimize theta_pi (and optionally theta_phi) wrt PPO loss
            ppo.step()

            # sample minibatch M from replay buffer R
            states, actions, rewards, next_states, _ = replay_memory.sample(batch_size)
            obs1, obs2 = emb.embed(states), emb.embed(next_states)  # embeddings of states
            actions_hot = np.squeeze(
                [np.eye(env.action_space.n)[action] for action in actions])

            # optimize theta_f wrt forward dynamics loss on minibatch
            if not using_extrinsic_reward:
                fd.train(obs1, obs2, actions_hot, learning_rate)

            # optionally optimize theta_phi, theta_g wrt the auxiliary loss
            if idf:
                idf.train(states, next_states, actions_hot, learning_rate)

        ppo.log()
        if ppo.timesteps_so_far >= n_timesteps:
            break
        i = ppo.timesteps_so_far

    if not debug and rank == 0:
        save_to_file(directory, env_name, graph_rewards, graph_best_rewards,
                     graph_epi_lens, graph_in_rewards, graph_avg_rewards,
                     graph_avg_epi_lens)
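# The intrinsic reward above is the forward-dynamics prediction error in
# embedding space, returned by fd.get_loss(obs1, obs2, one_hot_action). A
# numpy sketch of that quantity, with predict_next as a hypothetical stand-in
# for the learned forward model:

import numpy as np


def intrinsic_reward_sketch(phi_s, phi_s_next, action_one_hot, predict_next):
    """Curiosity bonus: squared error between predicted and actual next embedding."""
    phi_pred = predict_next(phi_s, action_one_hot)  # hypothetical forward model
    return float(np.mean((phi_pred - phi_s_next) ** 2))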
def calc_pa_best_response_PER(patroller, target_patroller, pa_copy_op, pa_good_copy_op,
                              poachers, po_strategy, po_type, iteration, sess, env, args,
                              final_utility, starting_e, train_episode_num=None,
                              po_locations=None):
    '''
    po_locations: if in the purely global mode, po_locations is None;
        otherwise it is the local + global retrain mode, and each entry of
        po_locations specifies the local mode of that poacher.
    Everything else is essentially the same as in 'calc_po_best_response_PER'.
    '''
    po_location = None
    # print('FIND_patroller_best_response iteration: ' + str(iteration))

    if train_episode_num is None:
        train_episode_num = args.pa_episode_num

    decrease_time = 1.0 / args.epsilon_decrease
    epsilon_decrease_every = train_episode_num // decrease_time

    if not args.PER:
        replay_buffer = ReplayBuffer(args, args.pa_replay_buffer_size)
    else:
        replay_buffer = PERMemory(args)

    best_utility = -10000.0
    test_utility = []

    if starting_e == 0:
        log = open(args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat', 'w')
        test_log = open(args.save_path + 'pa_log_test_iter_' + str(iteration) + '.dat', 'w')
    else:
        log = open(args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat', 'a')
        test_log = open(args.save_path + 'pa_log_test_iter_' + str(iteration) + '.dat', 'a')

    epsilon = 1.0
    learning_rate = args.po_initial_lr
    global_step = 0

    action_id = {
        'still': 0,
        'up': 1,
        'down': 2,
        'left': 3,
        'right': 4
    }

    sess.run(pa_copy_op)

    for e in range(starting_e, starting_e + train_episode_num):
        if e > 0 and e % epsilon_decrease_every == 0:
            epsilon = max(0.1, epsilon - args.epsilon_decrease)

        if e % args.mix_every_episode == 0 or e == starting_e:
            po_chosen_strat = np.argmax(np.random.multinomial(1, po_strategy))
            poacher = poachers[po_chosen_strat]
            type = po_type[po_chosen_strat]
            if po_locations is not None:  # local + global mode, needs to change the poacher mode
                po_location = po_locations[po_chosen_strat]

        ### reset the environment
        poacher.reset_snare_num()
        pa_state, po_state = env.reset_game(po_location)
        episode_reward = 0.0
        pa_action = 'still'

        for t in range(args.max_time):
            global_step += 1

            ### transition records the (s, a, r, s') tuple
            transition = []

            ### poacher chooses an action
            ### needed because heuristic and DQN agents have different infer_action APIs
            if type == 'DQN':
                if not env.catch_flag and not env.home_flag:  # if the poacher is not caught, it can still act
                    po_state = np.array([po_state])
                    snare_flag, po_action = poacher.infer_action(
                        sess=sess, states=po_state, policy="greedy",
                        po_loc=env.po_loc, animal_density=env.animal_density)
                else:
                    ### however, if it is caught, just make it stay still and do nothing
                    snare_flag = 0
                    po_action = 'still'
            elif type == 'PARAM':
                po_loc = env.po_loc
                if not env.catch_flag and not env.home_flag:
                    snare_flag, po_action = poacher.infer_action(
                        loc=po_loc,
                        local_trace=env.get_local_pa_trace(po_loc),
                        local_snare=env.get_local_snare(po_loc),
                        initial_loc=env.po_initial_loc)
                else:
                    snare_flag = 0
                    po_action = 'still'

            ### transition appends the current state
            transition.append(pa_state)

            ### patroller chooses an action
            pa_state = np.array([pa_state])
            pa_action = patroller.infer_action(
                sess=sess, states=pa_state, policy="epsilon_greedy",
                epsilon=epsilon, pa_loc=env.pa_loc,
                animal_density=env.animal_density)

            ### transition adds the action
            transition.append(action_id[pa_action])

            ### the game moves on a step
            pa_state, pa_reward, po_state, _, end_game = \
                env.step(pa_action, po_action, snare_flag)

            ### transition adds the reward and the next state
            episode_reward += pa_reward
            transition.append(pa_reward)
            transition.append(pa_state)

            ### Add transition to replay buffer
            replay_buffer.add_transition(transition)

            ### Start training
            ### Sample a minibatch once the replay buffer holds enough samples
            if replay_buffer.size >= args.batch_size:
                if not args.PER:
                    train_state, train_action, train_reward, train_new_state = \
                        replay_buffer.sample_batch(args.batch_size)
                else:
                    train_state, train_action, train_reward, train_new_state, \
                        idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size)

                ### Double DQN: get target
                max_index = patroller.get_max_q_index(sess=sess, states=train_new_state)
                max_q = target_patroller.get_q_by_index(sess=sess, states=train_new_state,
                                                        index=max_index)
                q_target = train_reward + args.reward_gamma * max_q

                if args.PER:
                    q_pred = sess.run(patroller.output, {patroller.input_state: train_state})
                    q_pred = q_pred[np.arange(args.batch_size), train_action]
                    TD_error_batch = np.abs(q_target - q_pred)
                    replay_buffer.update(idx_batch, TD_error_batch)

                if not args.PER:
                    weight = np.ones(args.batch_size)
                else:
                    weight = weight_batch

                ### Update parameters
                feed = {
                    patroller.input_state: train_state,
                    patroller.actions: train_action,
                    patroller.q_target: q_target,
                    patroller.learning_rate: learning_rate,
                    patroller.weight_loss: weight
                }
                sess.run(patroller.train_op, feed_dict=feed)

                ### Update target network
                if global_step % args.target_update_every == 0:
                    sess.run(pa_copy_op)

            ### game ends: 1) the patroller catches the poacher and removes all the snares;
            ###            2) the maximum time step is reached
            if end_game or (t == args.max_time - 1):
                info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \
                    (e, t + 1, episode_reward, 1. * episode_reward / (t + 1))
                if e % args.print_every == 0:
                    log.write(info + '\n')
                    print('pa ' + info)
                    # log.flush()
                break

        ### save the models, and test whether they are good
        if e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1:
            save_name = args.save_path + 'iteration_' + str(iteration) + '_epoch_' + str(e) + "_pa_model.ckpt"
            patroller.save(sess=sess, filename=save_name)

        ### test the agent
        if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode == 0):
            ### test against each strategy the poacher is using now; compute the expected utility
            pa_utility = 0.0
            test_total_reward = np.zeros(len(po_strategy))

            for po_strat in range(len(po_strategy)):
                if po_strategy[po_strat] > 1e-10:
                    if po_locations is None:  # indicates the purely global mode
                        tmp_po_location = None
                    else:  # indicates the local + global retrain mode, needs to set the poacher mode
                        tmp_po_location = po_locations[po_strat]
                    test_total_reward[po_strat], _, _ = test_(
                        patroller, poachers[po_strat], env, sess, args, iteration, e,
                        patroller_type='DQN', poacher_type=po_type[po_strat],
                        po_location=tmp_po_location)
                    ### update the expected utility
                    pa_utility += po_strategy[po_strat] * test_total_reward[po_strat]

            test_utility.append(pa_utility)

            if pa_utility > best_utility and (e > min(50000, train_episode_num / 2) or args.row_num == 3):
                best_utility = pa_utility
                sess.run(pa_good_copy_op)
                final_utility[0] = pa_utility

            info = [str(pa_utility)] + [str(x) for x in test_total_reward]
            info = 'test ' + str(e) + ' ' + '\t'.join(info) + '\n'
            # print('reward is: ', info)
            print('pa ' + info)
            test_log.write(info)
            test_log.flush()

    test_log.close()
    log.close()
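# Both best-response trainers above compute the Double DQN target: the online
# network picks argmax_a Q(s', a) and the target network evaluates that
# action. A numpy sketch of the same computation on batched Q-value arrays
# (mirroring the code above, which does not apply a terminal mask):

import numpy as np


def double_dqn_targets_sketch(rewards, q_online_next, q_target_next, gamma):
    """q_online_next, q_target_next: arrays of shape (batch, num_actions)."""
    best_actions = np.argmax(q_online_next, axis=1)
    max_q = q_target_next[np.arange(len(rewards)), best_actions]
    return rewards + gamma * max_q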
def train(env, model, max_steps, name, logdir, logger):
    target_model = create_model(env)
    replay = ReplayBuffer(REPLAY_BUFFER_SIZE)
    done = True
    episode = 0
    steps_after_logging = 0
    loss = 0.0
    for step in range(1, max_steps + 1):
        try:
            if step % SNAPSHOT_EVERY == 0:
                save_model(model, step, logdir, name)
            if done:
                if episode > 0:
                    if steps_after_logging >= LOG_EVERY:
                        steps_after_logging = 0
                        episode_end = time()
                        episode_seconds = episode_end - episode_start
                        episode_steps = step - episode_start_step
                        steps_per_second = episode_steps / episode_seconds
                        memory = psutil.virtual_memory()
                        to_gb = lambda in_bytes: in_bytes / 1024 / 1024 / 1024
                        print("episode {} "
                              "steps {}/{} "
                              "loss {:.7f} "
                              "return {} "
                              "in {:.2f}s "
                              "{:.1f} steps/s "
                              "{:.1f}/{:.1f} GB RAM".format(
                                  episode,
                                  episode_steps,
                                  step,
                                  loss,
                                  episode_return,
                                  episode_seconds,
                                  steps_per_second,
                                  to_gb(memory.used),
                                  to_gb(memory.total),
                              ))
                        logger.log_scalar('episode_return', episode_return, step)
                        logger.log_scalar('episode_steps', episode_steps, step)
                        logger.log_scalar('episode_seconds', episode_seconds, step)
                        logger.log_scalar('steps_per_second', steps_per_second, step)
                        logger.log_scalar('memory_used', to_gb(memory.used), step)
                        logger.log_scalar('loss', loss, step)
                episode_start = time()
                episode_start_step = step
                obs = env.reset()
                episode += 1
                episode_return = 0.0
            else:
                obs = next_obs

            action = epsilon_greedy_action(env, model, obs, epsilon=TRAIN_EPSILON)
            next_obs, reward, done, _ = env.step(action)
            episode_return += reward
            replay.add(obs, action, reward, next_obs, done)

            if step >= TRAIN_START:
                if step % TARGET_UPDATE_EVERY == 0:
                    target_model.set_weights(model.get_weights())
                batch = replay.sample(BATCH_SIZE)
                loss = fit_batch(env, model, target_model, batch)
            if step == Q_VALIDATION_SIZE:
                q_validation_observations, _, _, _, _ = replay.sample(Q_VALIDATION_SIZE)
            if step >= TRAIN_START and step % EVAL_EVERY == 0:
                episode_return_avg = evaluate(env, model)
                q_values = predict(env, model, q_validation_observations)
                max_q_values = np.max(q_values, axis=1)
                avg_max_q_value = np.mean(max_q_values)
                print("episode {} "
                      "step {} "
                      "episode_return_avg {:.3f} "
                      "avg_max_q_value {:.3f}".format(
                          episode,
                          step,
                          episode_return_avg,
                          avg_max_q_value,
                      ))
                logger.log_scalar('episode_return_avg', episode_return_avg, step)
                logger.log_scalar('avg_max_q_value', avg_max_q_value, step)
            steps_after_logging += 1
        except KeyboardInterrupt:
            save_model(model, step, logdir, name)
            break
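# fit_batch() is defined elsewhere; for a Keras DQN of this shape it usually
# builds one-step targets from the target network and fits the online model on
# them. A hedged sketch (the discount constant and helper name are
# assumptions, not this code's actual definitions):

import numpy as np

GAMMA_SKETCH = 0.99  # assumed discount; the real value lives in the config


def fit_batch_sketch(model, target_model, batch):
    obs, actions, rewards, next_obs, dones = batch
    # Bootstrap from the target network, zeroing the bootstrap on terminals.
    next_q = target_model.predict_on_batch(np.array(next_obs))
    targets = np.array(rewards) + GAMMA_SKETCH * (1.0 - np.array(dones)) * next_q.max(axis=1)
    # Only the taken action's Q-value is moved toward the target.
    q_values = model.predict_on_batch(np.array(obs))
    q_values[np.arange(len(actions)), np.array(actions, dtype=int)] = targets
    history = model.fit(np.array(obs), q_values, epochs=1, verbose=0)
    return history.history['loss'][0]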
def main():
    # env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        # convs=[(32,3,1)],
        hiddens=[16],
        # hiddens=[64],
        # dueling=True
        dueling=False
    )
    q_func = model
    # lr = 1e-3
    lr = 0.001

    def make_obs_ph(name):
        # return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False
    )

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):
        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add a small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # # debug
        # if t > 5000:
        #     print("obs:\n" + str(np.squeeze(obs)))
        #     print("qCurr:\n" + str(qCurr))
        #     print("action: " + str(action) + ", patch: " + str(selPatch))
        #     print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #     print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #     action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:
            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets
            # print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)
            td_error_post = qCurr[range(batch_size), actions] - targets
            # print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
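# The per-action loop that builds qCurrTargets above can be collapsed into a
# single fancy-indexing assignment. A sketch of the equivalent vectorized
# form (same result: copy the current Q-values, then overwrite only the taken
# action's entry with the TD target):

import numpy as np


def build_q_targets_sketch(q_curr, actions, targets):
    q_targets = np.copy(q_curr)
    q_targets[np.arange(len(actions)), actions] = targets
    return q_targets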
def run_for_config(config, print_messages):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # openrave_interface = OpenraveRLInterface(config, None)
    random_seed = config['general']['random_seed']
    np.random.seed(random_seed)
    random.seed(random_seed)
    tf.set_random_seed(random_seed)

    # where we save all the outputs (outputs will be saved according to the scenario)
    scenario = config['general']['scenario']
    working_dir = os.path.join(get_base_directory(), scenario)
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)

    saver_dir = os.path.join(working_dir, 'models', model_name)
    if not os.path.exists(saver_dir):
        os.makedirs(saver_dir)
    best_model_path = None
    config_copy_path = os.path.join(working_dir, 'models', model_name, 'config.yml')
    summaries_dir = os.path.join(working_dir, 'tensorboard', model_name)
    completed_trajectories_dir = os.path.join(working_dir, 'trajectories', model_name)

    # load images if required
    image_cache = None
    if _is_vision(scenario):
        image_cache = ImageCache(config['general']['params_file'], create_images=True)

    # load pretrained model if required
    pre_trained_reward = None
    if config['model']['use_reward_model']:
        reward_model_name = config['model']['reward_model_name']
        pre_trained_reward = PreTrainedReward(reward_model_name, config)

    # generate graph:
    network = Network(config, is_rollout_agent=False, pre_trained_reward=pre_trained_reward)

    def unpack_state_batch(state_batch):
        joints = [state[0] for state in state_batch]
        poses = {
            p.tuple: [state[1][p.tuple] for state in state_batch]
            for p in network.potential_points
        }
        jacobians = None
        return joints, poses, jacobians

    def score_for_hindsight(augmented_buffer):
        assert _is_vision(scenario)
        # unzip
        goal_pose_list, goal_joints_list, workspace_image_list, current_state_list, action_used_list, _, is_goal_list,\
            __ = zip(*augmented_buffer)
        # make a one-hot status vector:
        is_goal_one_hot_list = np.zeros((len(is_goal_list), 3), dtype=np.float32)
        for i in range(len(is_goal_list)):
            if is_goal_list[i]:
                is_goal_one_hot_list[i, 2] = 1.0  # mark as goal transition
            else:
                is_goal_one_hot_list[i, 0] = 1.0  # mark as free transition
        # unpack the current state
        current_joints, _, __ = unpack_state_batch(current_state_list)

        fake_rewards, _ = pre_trained_reward.make_prediction(
            sess, current_joints, goal_joints_list, action_used_list,
            goal_pose_list, all_transition_labels=is_goal_one_hot_list)
        return list(fake_rewards)

    # initialize replay memory
    replay_buffer = ReplayBuffer(config)
    hindsight_policy = HindsightPolicy(config, replay_buffer, score_for_hindsight)

    # save model
    latest_saver = tf.train.Saver(max_to_keep=2, save_relative_paths=saver_dir)
    best_saver = tf.train.Saver(max_to_keep=2, save_relative_paths=saver_dir)
    yaml.dump(config, open(config_copy_path, 'w'))
    summaries_collector = SummariesCollector(summaries_dir, model_name)
    rollout_manager = FixedRolloutManager(config, image_cache=image_cache)
    trajectory_eval = TrajectoryEval(config, rollout_manager, completed_trajectories_dir)

    test_results = []

    def update_model(sess, global_step):
        batch_size = config['model']['batch_size']
        gamma = config['model']['gamma']
        replay_buffer_batch = replay_buffer.sample_batch(batch_size)

        goal_pose, goal_joints, workspace_id, current_state, action, reward, terminated, next_state = \
            replay_buffer_batch

        # get image from the image cache
        workspace_image = None
        if image_cache is not None:
            workspace_image = [image_cache.get_image(k) for k in workspace_id]

        current_joints, _, __ = unpack_state_batch(current_state)
        next_joints, _, __ = unpack_state_batch(next_state)

        # get the predicted q value of the next state (action is taken from the target policy)
        next_state_action_target_q = network.predict_policy_q(
            next_joints, workspace_image, goal_pose, goal_joints, sess,
            use_online_network=False)

        # compute critic label
        q_label = np.expand_dims(
            np.squeeze(np.array(reward)) +
            np.multiply(np.multiply(1 - np.array(terminated), gamma),
                        np.squeeze(next_state_action_target_q)), 1)
        max_label = np.max(q_label)
        min_label = np.min(q_label)
        limit = 1.0 / (1.0 - gamma)
        if max_label > limit:
            print('out of range max label: {} limit: {}'.format(max_label, limit))
        if min_label < -limit:
            print('out of range min label: {} limit: {}'.format(min_label, limit))

        # # step to use for debug:
        # network.debug_all(current_joints, workspace_image, goal_pose, goal_joints, action, q_label, sess)

        # train the critic given the targets
        critic_optimization_summaries, _ = network.train_critic(
            current_joints, workspace_image, goal_pose, goal_joints, action,
            q_label, sess)

        # train the actor
        actor_optimization_summaries, _ = network.train_actor(
            current_joints, workspace_image, goal_pose, goal_joints, sess)

        # update target networks
        network.update_target_networks(sess)

        result = [
            critic_optimization_summaries,
            actor_optimization_summaries,
        ]
        return result

    def print_state(prefix, episodes, successful_episodes, collision_episodes, max_len_episodes):
        if not print_messages:
            return
        print('{}: {}: finished: {}, successful: {} ({}), collision: {} ({}), max length: {} ({})'.format(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            prefix, episodes, successful_episodes,
            float(successful_episodes) / episodes, collision_episodes,
            float(collision_episodes) / episodes, max_len_episodes,
            float(max_len_episodes) / episodes))

    def process_example_trajectory(episode_example_trajectory, episode_agent_trajectory):
        # creates an episode by computing the actions, and setting the rewards to None (they will be calculated later)
        _, __, ___, ____, goal_pose, goal_joints, workspace_id = episode_agent_trajectory
        example_trajectory, example_trajectory_poses = episode_example_trajectory
        example_trajectory = [j[1:] for j in example_trajectory]
        # goal reached always
        status = 3
        # get the states (joints, poses, jacobians); for now, ignore the jacobians
        states = [(example_trajectory[i], example_trajectory_poses[i], None)
                  for i in range(len(example_trajectory))]
        # compute the actions as the normalized difference between steps
        actions = [
            np.array(example_trajectory[i + 1]) - np.array(example_trajectory[i])
            for i in range(len(example_trajectory) - 1)
        ]
        actions = [a / max(np.linalg.norm(a), 0.00001) for a in actions]
        rewards = [-config['openrave_rl']['keep_alive_penalty']] * (len(actions) - 1) + [1.0]
        return status, states, actions, rewards, goal_pose, goal_joints, workspace_id

    def do_test(sess, best_model_global_step, best_model_test_success_rate):
        rollout_manager.set_policy_weights(
            network.get_actor_weights(sess, is_online=False), is_online=False)
        eval_result = trajectory_eval.eval(global_step, config['test']['number_of_episodes'])
        test_episodes = eval_result[0]
        test_successful_episodes = eval_result[1]
        test_collision_episodes = eval_result[2]
        test_max_len_episodes = eval_result[3]
        test_mean_reward = eval_result[4]
        if print_messages:
            print_state('test', test_episodes, test_successful_episodes,
                        test_collision_episodes, test_max_len_episodes)
            print('test mean total reward {}'.format(test_mean_reward))
        summaries_collector.write_test_episode_summaries(
            sess, global_step, test_episodes, test_successful_episodes,
            test_collision_episodes, test_max_len_episodes)
        test_results.append((global_step, episodes, test_successful_episodes,
                             test_collision_episodes, test_max_len_episodes,
                             test_mean_reward))
        # see if best
        rate = test_successful_episodes / float(test_episodes)
        if best_model_test_success_rate < rate:
            if print_messages:
                print('new best model found at step {}'.format(global_step))
                print('old success rate {} new success rate {}'.format(
                    best_model_test_success_rate, rate))
            is_best = True
            best_model_global_step = global_step
            best_model_test_success_rate = rate
        else:
            is_best = False
            if print_messages:
                print('best model still at step {}'.format(best_model_global_step))
        return is_best, best_model_global_step, best_model_test_success_rate

    def do_end_of_run_validation(sess):
        # restore the best model first
        best_saver.restore(sess, best_model_path)
        # set the weights
        rollout_manager.set_policy_weights(
            network.get_actor_weights(sess, is_online=False), is_online=False)
        eval_result = trajectory_eval.eval(-1, config['validation']['number_of_episodes'])
        test_episodes = eval_result[0]
        test_successful_episodes = eval_result[1]
        test_collision_episodes = eval_result[2]
        test_max_len_episodes = eval_result[3]
        test_mean_reward = eval_result[4]
        if print_messages:
            print_state('validation (best model)', test_episodes,
                        test_successful_episodes, test_collision_episodes,
                        test_max_len_episodes)
            print('validation (best model) mean total reward {}'.format(test_mean_reward))
        test_results.append((-1, episodes, test_successful_episodes,
                             test_collision_episodes, test_max_len_episodes,
                             test_mean_reward))
        rate = test_successful_episodes / float(test_episodes)
        print('final success rate is {}'.format(rate))
        return rate

    allowed_batch_episode_editor = config['model']['batch_size'] if _is_vision(scenario) else None
    regular_episode_editor = EpisodeEditor(
        config['model']['alter_episode'], pre_trained_reward,
        image_cache=image_cache, allowed_batch=allowed_batch_episode_editor)
    motion_planner_episode_editor = EpisodeEditor(
        config['model']['alter_episode_expert'], pre_trained_reward,
        image_cache=image_cache, allowed_batch=allowed_batch_episode_editor)

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=config['general']['gpu_usage']))) as sess:
        sess.run(tf.global_variables_initializer())
        if pre_trained_reward is not None:
            pre_trained_reward.load_weights(sess)
        network.update_target_networks(sess)

        global_step = 0
        episodes = successful_episodes = collision_episodes = max_len_episodes = 0
        best_model_global_step, best_model_test_success_rate = -1, -1.0

        for update_index in range(config['general']['updates_cycle_count']):
            # collect data
            a = datetime.datetime.now()
            rollout_manager.set_policy_weights(
                network.get_actor_weights(sess, is_online=True), is_online=True)
            episodes_per_update = config['general']['episodes_per_update']
            episode_results = rollout_manager.generate_episodes(episodes_per_update, True)
            episodes_agent_trajectory, episodes_times, episodes_example_trajectory = zip(*episode_results)

            # alter the episodes based on the reward model
            altered_episodes = regular_episode_editor.process_episodes(
                episodes_agent_trajectory, sess)

            # process example episodes for failed interactions
            altered_motion_planner_episodes = []
            failed_motion_planner_trajectories = config['model']['failed_motion_planner_trajectories']
            if failed_motion_planner_trajectories > 0:
                # take a small number of failed motion plans
                failed_episodes_indices = [
                    i for i in range(len(altered_episodes))
                    if altered_episodes[i][0] != 3
                ]
                failed_episodes_indices = failed_episodes_indices[:failed_motion_planner_trajectories]
                motion_planner_episodes = [
                    process_example_trajectory(episodes_example_trajectory[i],
                                               altered_episodes[i])
                    for i in failed_episodes_indices
                ]
                altered_motion_planner_episodes = motion_planner_episode_editor.process_episodes(
                    motion_planner_episodes, sess)

            # add to the replay buffer
            hindsight_policy.append_to_replay_buffer(
                list(altered_episodes) + list(altered_motion_planner_episodes))

            # compute times
            total_find_trajectory_time = None
            total_rollout_time = None
            for episode_times in episodes_times:
                # update the times
                find_trajectory_time, rollout_time = episode_times
                if total_find_trajectory_time is None:
                    total_find_trajectory_time = find_trajectory_time
                else:
                    total_find_trajectory_time += find_trajectory_time
                if total_rollout_time is None:
                    total_rollout_time = rollout_time
                else:
                    total_rollout_time += rollout_time

            # compute counters
            for altered_episode in altered_episodes:
                status = altered_episode[0]
                episodes += 1
                if status == 1:
                    max_len_episodes += 1
                elif status == 2:
                    collision_episodes += 1
                elif status == 3:
                    successful_episodes += 1

            b = datetime.datetime.now()
            print('data collection took: {}'.format(b - a))
            print('find trajectory took: {}'.format(total_find_trajectory_time))
            print('rollout time took: {}'.format(total_rollout_time))
            print_state('train', episodes, successful_episodes,
                        collision_episodes, max_len_episodes)

            # do updates
            if replay_buffer.size() > config['model']['batch_size']:
                a = datetime.datetime.now()
                for _ in range(config['general']['model_updates_per_cycle']):
                    summaries = update_model(sess, global_step)
                    if global_step % config['general']['write_train_summaries'] == 0:
                        summaries_collector.write_train_episode_summaries(
                            sess, global_step, episodes, successful_episodes,
                            collision_episodes, max_len_episodes)
                        summaries_collector.write_train_optimization_summaries(
                            summaries, global_step)
                    global_step += 1
                b = datetime.datetime.now()
                print('update took: {}'.format(b - a))

            # test if needed
            if update_index % config['test']['test_every_cycles'] == 0:
                is_best, best_model_global_step, best_model_test_success_rate = do_test(
                    sess, best_model_global_step, best_model_test_success_rate)
                if is_best:
                    best_model_path = best_saver.save(
                        sess, os.path.join(saver_dir, 'best'), global_step=global_step)
            if update_index % config['general']['save_model_every_cycles'] == 0:
                latest_saver.save(sess, os.path.join(saver_dir, 'last_iteration'),
                                  global_step=global_step)

            # see if the max score was reached (even if validation is not 100%, there will no longer be any model updates...)
            if best_model_test_success_rate > 0.99999:
                print('stopping run: best test success rate reached {}'.format(
                    best_model_test_success_rate))
                break

        # final test at the end
        is_best, best_model_global_step, best_model_test_success_rate = do_test(
            sess, best_model_global_step, best_model_test_success_rate)
        if is_best:
            best_model_path = best_saver.save(
                sess, os.path.join(saver_dir, 'best'), global_step=global_step)

        # get a validation rate for the best recorded model
        validation_rate = do_end_of_run_validation(sess)

        last_message = 'best model stats at step {} has success rate of {} and validation success rate of {}'.format(
            best_model_global_step, best_model_test_success_rate, validation_rate)
        print(last_message)
        with open(os.path.join(completed_trajectories_dir, 'final_status.txt'), 'w') as f:
            f.write(last_message)
            f.flush()

        test_results_file = os.path.join(completed_trajectories_dir,
                                         'test_results.test_results_pkl')
        with bz2.BZ2File(test_results_file, 'w') as compressed_file:
            pickle.dump(test_results, compressed_file)

        rollout_manager.end()
        return test_results
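# HindsightPolicy above augments the replay buffer with relabeled goals. As a
# minimal sketch of the "final" hindsight-experience-replay strategy on a
# generic transition list (the namedtuple field layout here is an assumption
# for the sketch, not this project's actual episode format):

def relabel_final_sketch(episode_transitions):
    """Treat the last achieved state as the goal and relabel rewards accordingly."""
    relabeled = []
    achieved_goal = episode_transitions[-1].next_state
    for tr in episode_transitions:
        reward = 1.0 if tr.next_state == achieved_goal else 0.0
        relabeled.append(tr._replace(goal=achieved_goal, reward=reward))
    return relabeled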
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    if FLAGS.expert_dir.find(FLAGS.env) == -1:
        raise ValueError('Expert directory must contain the environment name')

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    expert_replay_buffer_var = contrib_eager_python_tfe.Variable(
        '', name='expert_replay_buffer')
    saver = contrib_eager_python_tfe.Saver([expert_replay_buffer_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    with tf.variable_scope('actor'):
        actor = Actor(obs_shape[0], act_shape[0])

    expert_saver = contrib_eager_python_tfe.Saver(actor.variables)

    best_checkpoint = None
    best_reward = float('-inf')

    checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir)
    for checkpoint in checkpoint_state.all_model_checkpoint_paths:
        expert_saver.restore(checkpoint)
        expert_reward, _ = do_rollout(env, actor, replay_buffer=None,
                                      noise_scale=0.0, num_trajectories=10)
        if expert_reward > best_reward:
            best_reward = expert_reward
            best_checkpoint = checkpoint

    expert_saver.restore(best_checkpoint)

    expert_replay_buffer = ReplayBuffer()
    expert_reward, _ = do_rollout(
        env, actor, replay_buffer=expert_replay_buffer, noise_scale=0.0,
        num_trajectories=FLAGS.num_expert_trajectories)

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer))
    saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))
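# The expert buffer saved above is a pickled blob stored inside a string
# variable; restoring it mirrors the load path used in the earlier training
# script in this file, minus the zlib compression that script applies. A
# sketch assuming the same Saver/Variable layout:

def load_expert_replay_buffer_sketch(saver, buffer_var, checkpoint_path):
    saver.restore(checkpoint_path)
    return pickle.loads(buffer_var.numpy())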