def get_demo_greedy(env, file_name):
    # Roll out the value-iteration expert to collect demonstration transitions.
    demo_buffer = deque()
    demo = []
    demo_sas = []
    REWARDS, REWARD100, reward100 = [], [], 0

    state = env.reset()
    vision = state['vision']
    n_tongs = env._agent._items[1]  # number of tongs carried
    q_value = value_iteration(vision, n_tongs)
    action = np.argmax(q_value[5, 5, 0] + 1e-2 * np.random.rand(3))  # add noise to break ties
    state = trans_state(state)
    print(DQfDConfig.demo_buffer_size)

    for steps in range(DQfDConfig.demo_buffer_size):
        next_state, reward, done, _ = env.step(action)
        reward100 += reward
        REWARDS.append(reward)

        # Query the expert again on the new observation to choose the next action.
        vision = next_state['vision']
        n_tongs = env._agent._items[1]
        q_value = expert_VI(vision, n_tongs)
        action = np.argmax(q_value[5, 5, 0] + 1e-2 * np.random.rand(3))

        next_state = trans_state(next_state)
        demo.append([state, action, reward, next_state, done, 1.0])  # 1.0 flags a demo transition
        state = next_state

        if (steps + 1) % DQfDConfig.eps_gap == 0:
            print("demo - steps: {} reward100: {}".format(steps, reward100))
            REWARD100.append(reward100)
            reward100 = 0

        if (steps + 1) % (DQfDConfig.eps_gap * 10) == 0:
            with open(file_name + 'REWARD100.p', 'wb') as f:
                pickle.dump(REWARD100, f, protocol=2)
            with open(file_name + 'REWARD100.txt', 'w') as f:
                f.write(str(REWARD100))
            with open(file_name + 'REWARDS.p', 'wb') as f:
                pickle.dump(REWARDS, f, protocol=2)
            with open(file_name + 'REWARDS.txt', 'w') as f:
                f.write(str(REWARDS))
            plot(1, REWARDS, file_name)

    # Final dump of the reward curves after the rollout finishes.
    with open(file_name + 'REWARD100.p', 'wb') as f:
        pickle.dump(REWARD100, f, protocol=2)
    with open(file_name + 'REWARD100.txt', 'w') as f:
        f.write(str(REWARD100))
    with open(file_name + 'REWARDS.p', 'wb') as f:
        pickle.dump(REWARDS, f, protocol=2)
    with open(file_name + 'REWARDS.txt', 'w') as f:
        f.write(str(REWARDS))
    plot(1, REWARDS, file_name)

    # Attach n-step return information to every demo transition and save the buffer.
    demo = set_n_step(demo, DQfDConfig.trajectory_n)
    demo_buffer.extend(demo)
    with open(file_name + 'demo.txt', "w") as file:
        file.write(str(demo_buffer))
    with open(file_name + 'demo.p', 'wb') as f:
        pickle.dump(demo_buffer, f, protocol=2)
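# NOTE: `trans_state` and `set_n_step` are helpers assumed to be defined elsewhere
# in this repo. The function below is only a hypothetical sketch of what
# `set_n_step` is expected to do, not the repo's own implementation: it appends
# [n_step_reward, n_step_state, n_step_done, actual_n] to every demo transition,
# mirroring the online n-step bookkeeping done in run_DQfD().
def _set_n_step_sketch(transitions, n, gamma=0.99):
    t_list = [list(t) for t in transitions]
    for begin in range(len(t_list)):
        end = min(len(t_list) - 1, begin + n - 1)
        # discounted return over the (at most n)-step window starting at `begin`
        n_step_reward = sum(t_list[begin + i][2] * gamma ** i
                            for i in range(end - begin + 1))
        t_list[begin].extend([n_step_reward, t_list[end][3], t_list[end][4],
                              end - begin + 1])
    return t_list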
def main(filename):
    with open(filename, 'r') as f:
        rewards = f.read().splitlines()
    rewards = [float(r) for r in rewards]
    # rew = [0] * len(rewards) * 100
    # for i, r in enumerate(rewards):
    #     rew[(i + 1) * 100 - 1] = r * 100
    plot('./', rewards[:50000])
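# Hypothetical entry point (not part of the original script): `main` expects a
# plain text file with one reward per line, so it can be driven from the command
# line, assuming the file path is passed as the first argument.
if __name__ == '__main__':
    import sys
    main(sys.argv[1])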
def plot_train_stats(self):
    for tr in self.train_rewards[-1000:]:
        self.train_file.write(str(tr))
        self.train_file.write('\n')
    self.train_file.flush()

    if not self.test and self.train_rewards[-1] > 0:
        self.net.A.save("checkpoint.pth")
        self.net.Ensemble.save()
    if self.train_rewards[-1] > 0:
        print('[%d] Train Reward: %.4f' % (len(self.train_rewards), self.train_rewards[-1]))
    self.steps = 0

    x = list(range(len(self.train_rewards)))
    plt.plot(x, self.train_rewards, '-bo')
    plt.xlabel('Time')
    plt.ylabel('Average Reward')
    plt.title('Training Curve')
    plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
    plt.close()
    plot(self.dump_dir + self.method, self.train_rewards)
def plot_train_stats(self):
    self.cum_reward = self.cum_reward / float(self.log_time)
    self.train_rewards.append(self.cum_reward)
    self.train_file.write(str(self.cum_reward))
    self.train_file.write('\n')
    self.train_file.flush()
    self.cum_reward = 0.0

    if self.train_rewards[-1] > 0:
        self.net.A.save("checkpoint.pth")
    print('[%d] Train Reward: %.4f' % (len(self.train_rewards), self.train_rewards[-1]))
    self.steps = 0

    x = list(range(len(self.train_rewards)))
    plt.plot(x, self.train_rewards, '-bo')
    plt.xlabel('Time')
    plt.ylabel('Average Reward')
    plt.title('Training Curve')
    plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
    plt.close()
    plot(self.dump_dir + self.method, self.train_rewards)
def run_DQfD(index, env, file_demo, file_name):
    # Load the expert demonstrations collected by get_demo_greedy().
    with open(file_demo + 'demo.p', 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, DQfDConfig.demo_buffer_size))
        assert len(demo_transitions) == DQfDConfig.demo_buffer_size

    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)
    agent.pre_train()  # use the demo data to pre-train network

    REWARDS, REWARD100, episode, replay_full_episode = [], [], 0, None
    reward100, n_step_reward, state = 0, None, env.reset()
    state = trans_state(state)
    t_q = deque(maxlen=DQfDConfig.trajectory_n)

    for steps in range(DQfDConfig.episode):
        action = agent.egreedy_action(state)  # e-greedy action for train
        next_state, reward, done, _ = env.step(action)
        next_state = trans_state(next_state)
        reward100 += reward
        REWARDS.append(reward)
        t_q.append([state, action, reward, next_state, done, 0.0])

        # record the earliest reward for the sub-sequence
        if len(t_q) < t_q.maxlen:
            reward_to_sub = 0.
        else:
            reward_to_sub = t_q[0][2]
        if n_step_reward is None:  # only compute once when t_q first filled
            n_step_reward = sum([t[2] * DQfDConfig.GAMMA ** i for i, t in enumerate(t_q)])
        else:
            n_step_reward = (n_step_reward - reward_to_sub) / DQfDConfig.GAMMA
            n_step_reward += reward * DQfDConfig.GAMMA ** (DQfDConfig.trajectory_n - 1)
        t_q[0].extend([n_step_reward, next_state, done, t_q.maxlen])  # actual_n is max_len here

        update_eps = (steps + 1) % DQfDConfig.eps_gap == 0
        agent.perceive(t_q[0], update_eps=update_eps)  # perceive when a transition is completed
        if (steps + 1) % DQfDConfig.UPDATE_ESTIMATE_NET == 0:
            agent.train_Q_network(update=False)  # train along with generation
            replay_full_episode = replay_full_episode or episode
        state = next_state

        if (steps + 1) % DQfDConfig.UPDATE_TARGET_NET == 0:
            if agent.replay_memory.full():
                agent.sess.run(agent.update_target_net)

        if (steps + 1) % DQfDConfig.eps_gap == 0:
            episode += 1
            if replay_full_episode is not None:
                print("episode: {} trained-episode: {} reward100: {} memory length: {} epsilon: {}"
                      .format(episode, episode - replay_full_episode, reward100,
                              len(agent.replay_memory), agent.epsilon))
            REWARD100.append(reward100)
            reward100 = 0

        if (steps + 1) % (DQfDConfig.eps_gap * 100) == 0:
            with open(file_name + 'REWARD100.p', 'wb') as f:
                pickle.dump(REWARD100, f, protocol=2)
            with open(file_name + 'REWARD100.txt', 'w') as f:
                f.write(str(REWARD100))
            with open(file_name + 'REWARDS.p', 'wb') as f:
                pickle.dump(REWARDS, f, protocol=2)
            with open(file_name + 'REWARDS.txt', 'w') as f:
                f.write(str(REWARDS))
            plot(1, REWARDS, file_name)

    # Final dump of the reward curves after the training loop finishes.
    with open(file_name + 'REWARD100.p', 'wb') as f:
        pickle.dump(REWARD100, f, protocol=2)
    with open(file_name + 'REWARD100.txt', 'w') as f:
        f.write(str(REWARD100))
    with open(file_name + 'REWARDS.p', 'wb') as f:
        pickle.dump(REWARDS, f, protocol=2)
    with open(file_name + 'REWARDS.txt', 'w') as f:
        f.write(str(REWARDS))
    plot(1, REWARDS, file_name)
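# Sanity-check sketch (not part of the original file): verifies the incremental
# n-step return update used in run_DQfD() above,
#     R' = (R - r_oldest) / gamma + gamma**(n-1) * r_new,
# against a direct recomputation over the sliding reward window.
def _check_n_step_recursion(rewards, n, gamma):
    from collections import deque
    window = deque(rewards[:n], maxlen=n)
    R = sum(r * gamma ** i for i, r in enumerate(window))
    for r_new in rewards[n:]:
        r_oldest = window[0]
        window.append(r_new)  # drops the oldest reward from the window
        R = (R - r_oldest) / gamma + r_new * gamma ** (n - 1)
        direct = sum(r * gamma ** i for i, r in enumerate(window))
        assert abs(R - direct) < 1e-6
    return True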
    return q_value  # tail of the expert value-iteration routine; its body is omitted in this snippet


def run_policy(env):
    env.reset()
    observation, _, _, _ = env.step(0)
    reward_list = []
    for step in range(1000000):
        # env.render()
        vision = observation['vision']
        n_tongs = env._agent._items[1]  # number of tongs carried
        q_value = value_iteration(vision, n_tongs)
        action = np.argmax(q_value[5, 5, 0] + 1e-2 * np.random.rand(3))  # add noise to break ties
        observation, reward, _, _ = env.step(action)
        reward_list.append(reward)
        if step % 100 == 0:
            print('step = {:d}'.format(step))
        if (step + 1) % 10000 == 0:
            num = (step + 1) // 10000
            with open('greedy_{:d}.pickle'.format(num), 'wb') as f:
                pickle.dump(reward_list, f)
    return reward_list


if __name__ == '__main__':
    env = gym.make('NEL-v0')
    # env_r = gym.make('NEL-render-v0')
    reward_list = run_policy(env)
    canonical_plot.plot('greedy', reward_list)
if __name__ == '__main__':
    # Setting the session to allow growth, so it doesn't allocate all GPU memory.
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    sess = tf.Session(config=config)

    # Setting this as the default tensorflow session.
    keras.backend.tensorflow_backend.set_session(sess)

    # Gather command-line args.
    args = parse_arguments()
    environment_name = args.env
    model_name = args.model_name

    agent = Deep_Agent(environment_name, model_name, num_episodes=num_episodes, curve_episodes=1000)
    agent.burn_in_memory()
    training_step_rewards = agent.train(n_steps_to_reset=100000000000000000)
    plot("tsr", training_step_rewards)

    rewards = agent.get_rewards(10000)
    plot("pr", rewards)

    filename = "rewards_file"
    with open(str(filename) + ".pkl", 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([training_step_rewards, rewards], f, protocol=2)

    # u, std = agent.test_stats(10, 100)  # 6.e
    # agent.performance_curves_from_weight_files(10, 100)
    # agent.plots()
rewards = []
state = env.reset()
state = trans_state(state)
# state = np.expand_dims(state, 0)
for step in range(n_steps):
    # env.render()
    action = np.rint(yhat.eval(feed_dict={x: state[None, :]})[0][0]).astype(int)
    # print(action)
    next_state, reward, done, info = env.step(action)
    next_state = trans_state(next_state)
    # next_state = np.expand_dims(next_state, 0)
    state = next_state
    rewards.append(reward)

dagger_results = {
    'means': save_mean,
    'stds': save_std,
    'train_size': save_train_size,
    'expert_mean': save_expert_mean,
    'expert_std': save_expert_std
}
print('DAgger iterations finished!')
print(dagger_results)

# plot("dagger_tr", tr_rewards)
plot("dagger_pr", rewards)
filename = "dagger_rewards_file"
with open(str(filename) + ".pkl", 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([rewards, dagger_results], f, protocol=2)
def train(self):
    train_rewards = []
    test_rewards = []
    count = 0
    steps = 0
    test_steps = 0
    cum_reward = 0.0
    elapsed = 0.0

    curr_state = self.env.reset()
    curr_state = self.burn_in_memory(curr_state)
    prev_action = -1
    if self.render:
        self.env.render()

    for i in range(self.training_time):
        # Get q_values based on the current state
        Vt, St = self.get_input_tensor(curr_state)
        q_values = self.net.get_Q_output(Vt, St)

        # Selecting an action based on the policy
        action = self.epsilon_greedy_policy(q_values, self.epsilon)
        # if not curr_state['moved'] and action == prev_action and self.epsilon > 0.1:
        #     action = self.epsilon_greedy_policy(q_values, 0.5)

        # Executing action in simulator
        nextstate, reward, _, _ = self.env.step(action)
        steps = steps + 1
        test_steps = test_steps + 1
        if self.render:
            self.env.render()

        # Store Transition
        if nextstate['moved'] or prev_action != action:
            self.replay_buffer.add(curr_state, action, reward / 100.0, nextstate, 0)
        prev_action = action

        # Sample random minibatch from experience replay
        if self.prioritized_replay:
            batch, weights, batch_idxes = self.replay_buffer.sample(
                self.batch_size, beta=self.beta_schedule.value(i))
        else:
            batch = self.replay_buffer.sample(self.batch_size)
            weights, batch_idxes = np.ones(self.batch_size), None

        # Train the Network with mini batches
        xVT, xST = self.get_input_tensors(batch)
        yT = self.get_output_tensors(batch)

        # Mask to select the actions from the Q network output
        mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8)
        for k, tran in enumerate(batch):
            mT[k, tran[1]] = 1

        td_errors = self.net.train(xVT, xST, yT, mT, weights)

        if self.prioritized_replay:
            # new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
            # new_priorities = []
            # for i, tran in enumerate(batch):
            #     new_priorities.append(tran[2] + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch_idxes, weights)

        # Decay epsilon
        self.update_epsilon()

        cum_reward += reward
        curr_state = nextstate

        if steps == 100:
            cum_reward = cum_reward / float(self.log_time)
            train_rewards.append(cum_reward)
            self.train_file.write(str(cum_reward))
            self.train_file.write('\n')
            self.train_file.flush()
            cum_reward = 0.0
            print('Train Reward: %.4f' % (train_rewards[-1]))
            steps = 0

            x = list(range(len(train_rewards)))
            plt.plot(x, train_rewards, '-bo')
            plt.xlabel('Time')
            plt.ylabel('Average Reward')
            plt.title('Training Curve')
            plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
            plt.close()
            plot(self.dump_dir + self.method, train_rewards)

        # if test_steps == 500:
        #     self.net.set_eval()
        #     test_rewards.append(self.test())
        #     self.test_file.write(str(test_rewards[-1]))
        #     self.test_file.write('\n')
        #     self.test_file.flush()
        #     self.net.set_train()
        #     count = count + 1
        #     print('\nTest Reward: %.4f\n' % (test_rewards[-1]))
        #     test_steps = 0
        #
        #     x = list(range(len(test_rewards)))
        #     plt.plot(x, test_rewards, '-bo')
        #     plt.xlabel('Time')
        #     plt.ylabel('Average Reward')
        #     plt.title('Testing Curve')
        #     plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
        #     plt.close()

        if count > 0 and count % 30 == 0:
            self.net.save_model_weights(count, self.dump_dir)
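# NOTE: `update_epsilon` is defined elsewhere in this agent class; the sketch
# below is only a hypothetical linear-annealing version for reference (the
# attribute names `epsilon_min` and `epsilon_decay` are assumptions, not the
# repo's own).
def _update_epsilon_sketch(self):
    # anneal the exploration rate each step until it reaches the floor
    self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)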