def play_deterministic(self, n_tot):
    self.model.eval()
    env = Env()
    render = args.render
    n_human = 60
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        print("Observation %s" % observation)
        trajectory = self.data[observation]

        j = 0
        ims = []
        # fig = plt.figure()
        while not env.t:
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                # im = plt.imshow(np.rollaxis(env.s.numpy().squeeze(0)[:3], 0, 3), animated=True)
                # ims.append([im])
                if self.cuda:
                    s = Variable(env.s.cuda(), requires_grad=False)
                else:
                    s = Variable(env.s, requires_grad=False)
                _, _, beta, _, _, _ = self.model(s)
                beta = beta.squeeze(0)
                beta = beta.sign().int() * (beta.abs() > 0.5).int()
                a = reverse_excitation_index[tuple(beta.data)]
            env.step(a)
            j += 1

        # if render:
        #     ani = animation.ArtistAnimation(fig, ims, interval=10, blit=True, repeat=False)
        #     plt.show()
        yield env.score
def play_episode_deterministic(self, n_tot):
    self.model.eval()
    env = Env()
    n_human = 300
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]

        j = 0
        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            v, q, beta, r, p, phi = self.model(s)
            beta = beta.squeeze(0)

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                beta_index = (beta.sign().int() * (beta.abs() > 0.5).int()).data.cpu().numpy()
                beta_index[0] = abs(beta_index[0])
                # beta_index is already a numpy array, so it can be tupled directly
                a = reverse_excitation_index[tuple(beta_index)]

            env.step(a)

            # x = phi.squeeze(0).data.cpu().numpy()
            # print(np.mean(abs(x)))
            # yield v, q, beta, r, p, s
            yield {'o': env.s.cpu().numpy(),
                   'v': v.data.cpu().numpy(),
                   's': phi.data.cpu().numpy(),
                   'score': env.score,
                   'beta': beta.data.cpu().numpy(),
                   'phi': phi.squeeze(0).data.cpu().numpy()}

            j += 1
def cartpole():
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.box.shape[0]
    action_space = env.action_space.discrete.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            print("acting on state: ", state)
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                plt.plot(dqn_solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            dqn_solver.experience_replay()
def experiment1_test(
        output_folder,
        word_vectors,
        agent,
        episode_index,
        testset_path='./dataset/conll2003/en/eng.testb',
):
    # initialize the environment
    env = Env(testset_path, word_vectors)
    step = 0
    s = env.reset()
    print('[' + util.now_time() + "] start testing...")
    while True:
        # check whether the task has ended
        if env.end():
            print('[' + util.now_time() + "] testing...done")
            result_file = '%03d_episode_test.txt' % (episode_index + 1)
            env.save_all_newlines_to_file(output_folder, result_file)
            return evaluate.conlleval(output_folder, result_file)
        # choose action a
        a = agent.choose_action(s)
        # execute the action
        s_, r = env.step(a)
        # move to the next state
        step += 1
        s = s_
def test(self, num_actions):
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path

    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
    env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        state = env.get_initial_state()
        episode_reward = 0
        done = False

        # create the state sequence
        state_sequence = np.zeros((t_max, FLAGS.history_length, FLAGS.width, FLAGS.height))
        state_sequence[t_max - 1, :, :, :] = state

        while not done:
            monitor_env.render()
            q_values = self.q_values.eval(session=self.session, feed_dict={self.state: [state_sequence]})
            action_index = np.argmax(q_values)
            new_state, reward, done = env.step(action_index)
            state = new_state

            # update the state sequence: drop the oldest frame, append the newest
            state_sequence = np.delete(state_sequence, 0, 0)
            state_sequence = np.insert(state_sequence, t_max - 1, state, 0)

            episode_reward += reward

        print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)

    monitor_env.monitor.close()
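# A standalone sketch (with made-up dimensions) of the state-sequence update used in
# the test() function above: np.delete drops the oldest frame along axis 0 and
# np.insert appends the newest at the end, keeping the window length fixed.
import numpy as np

t_max, history_length, width, height = 5, 4, 84, 84
state_sequence = np.zeros((t_max, history_length, width, height))
new_frame = np.ones((history_length, width, height))  # stands in for the latest state

state_sequence = np.delete(state_sequence, 0, 0)                     # drop oldest
state_sequence = np.insert(state_sequence, t_max - 1, new_frame, 0)  # append newest

assert state_sequence.shape == (t_max, history_length, width, height)
assert (state_sequence[-1] == 1).all()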
def play_deterministic(self, n_tot):
    self.model.eval()
    env = Env()
    render = args.render
    n_human = 60
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        print("Observation %s" % observation)
        trajectory = self.data[observation]

        j = 0
        while not env.t:
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                if self.cuda:
                    s = Variable(env.s.cuda(), requires_grad=False)
                else:
                    s = Variable(env.s, requires_grad=False)
                _, _, beta, _, _, _ = self.model(s)
                beta = beta.squeeze(0)
                beta = (beta.sign().int() * (beta.abs() > 0.5).int()).data
                if self.cuda:
                    beta = beta.cpu().numpy()
                else:
                    beta = beta.numpy()
                beta[0] = abs(beta[0])
                a = reverse_excitation_index[tuple(beta)]
            env.step(a)
            j += 1

        yield {'o': env.s.cpu().numpy(), 'score': env.score}
def experiment1_train(
        output_folder,
        word_vectors,
        n_episodes=300,
        trainset_path='./dataset/conll2003/en/eng.train',
):
    # initialize the environment
    print('[' + util.now_time() + "] init environment...")
    env = Env(trainset_path, word_vectors)
    print('[' + util.now_time() + "] environment initialized")

    # initialize the DQN
    print('[' + util.now_time() + "] init agent...")
    agent = DQN(n_actions=env.n_actions,
                status_dim=env.status_dim,
                action_dim=env.action_dim,
                reward_dim=env.reward_dim)
    print('[' + util.now_time() + "] agent initialized")

    # iterate over episodes
    for i in range(n_episodes):
        print('[' + util.now_time() + "] start episode %03d of learning..." % (i + 1))
        step = 0
        s = env.reset()
        while True:
            # check whether the task has ended
            if env.end():
                print('[' + util.now_time() + "] episode %03d of learning...done" % (i + 1))
                result_file = '%03d_episode_train.txt' % (i + 1)
                env.save_all_newlines_to_file(output_folder, result_file)
                train_eval = evaluate.conlleval(output_folder, result_file)
                test_eval = experiment1_test(output_folder, word_vectors, agent, i)
                break
            # choose action a
            a = agent.choose_action(s)
            # execute the action
            # print('step %d' % step)
            s_, r = env.step(a)
            agent.store_transition(s, a, r, s_)
            step += 1
            s = s_
            if step > 200 and step % 5 == 0:
                agent.learn()

    # plot and compare train and test set TODO
    # plot(train_evals, test_evals)
    agent.eval_network.save(output_folder + os.path.sep + 'ex1_eval_model', overwrite=True)
def test(self, env):
    # initialize environment
    env = Env(env, 84, 84, 4)
    terminal = False

    # get the initial game observation
    state = env.get_initial_state()

    # episode's reward and cost
    episode_reward = 0

    for _ in range(100):
        while not terminal:
            # forward pass of the network: get probabilities of all actions
            probs, v = self.sess.run((self.policy, self.state_value), feed_dict={self.input_state: [state]})
            probs = probs[0]
            v = v[0][0]

            if random.random() < 0.01:
                action_index = random.choice([0, 1, 2, 3])
            else:
                action_index = np.argmax(probs)

            # Gym executes the action in the game environment on behalf of the actor-learner
            new_state, reward, terminal = env.step(action_index)
            env.env.render()

            # update the state and counters
            state = new_state
            episode_reward += reward

        if terminal:
            terminal = False
            print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", episode_reward, "/ COST"
            episode_reward = 0
            counter = 0
            # get the initial game observation
            state = env.get_initial_state()
def test(self, num_actions):
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path

    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
    env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        state = env.get_initial_state()
        episode_reward = 0
        done = False

        while not done:
            monitor_env.render()
            probs = self.session.run(self.policy_values, feed_dict={self.state: [state]})[0]
            action_index = sample_policy_action(num_actions, probs)
            new_state, reward, done = env.step(action_index)
            state = new_state
            episode_reward += reward

        print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)

    monitor_env.monitor.close()
def train(self, env, checkpoint_interval, checkpoint_dir, saver, gamma=0.99):
    global T
    self.saver = saver

    # initialize environment
    time.sleep(3 * self.thread_id)
    env = Env(env, 84, 84, 4)
    print 'Starting thread ' + str(self.thread_id)
    terminal = False

    # get the initial game observation
    state = env.get_initial_state()

    # episode's reward and cost
    episode_reward = 0
    total_cost = 0
    counter = 0

    while T < self.TMAX:
        # lists for feeding placeholders
        states = []
        actions = []
        prev_reward = []
        state_values = []

        t = 0
        t_start = t

        self.sess.run(self.sync_op)

        while not (terminal or ((t - t_start) == self.tmax)):
            # forward pass of the network: get probabilities of all actions
            probs, v = self.sess.run((self.policy, self.state_value), feed_dict={self.input_state: [state]})
            probs = probs[0]
            v = v[0][0]

            # print the outputs of the neural network for a sanity check
            if T % 2000 == 0:
                print probs
                print v

            # define the action list: all values are zero except the action that is executed
            action_list = np.zeros([self.output_size])

            # choose an action based on the policy
            action_index = sample_policy_action(probs)
            action_list[action_index] = 1

            # add state, action, and value to the lists
            actions.append(action_list)
            states.append(state)
            state_values.append(v)

            # Gym executes the action in the game environment on behalf of the actor-learner
            new_state, reward, terminal = env.step(action_index)

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)
            prev_reward.append(clipped_reward)

            # update the state and global counters
            state = new_state
            T += 1
            t += 1
            counter += 1

            # update the episode's counter
            episode_reward += reward

            # save model progress
            if T % checkpoint_interval < 200:
                T += 200
                self.saver.save(self.sess, checkpoint_dir + "/breakout.ckpt", global_step=T)

        if terminal:
            R_t = 0
        else:
            R_t = self.sess.run(self.state_value, feed_dict={self.input_state: [state]})
            R_t = R_t[0][0]

        state_values.append(R_t)
        targets = np.zeros((t - t_start))

        for i in range(t - t_start - 1, -1, -1):
            R_t = prev_reward[i] + gamma * R_t
            targets[i] = R_t

        # compute the advantage based on GAE
        # code from https://github.com/openai/universe-starter-agent
        delta = np.array(prev_reward) + gamma * np.array(state_values[1:]) - np.array(state_values[:-1])
        advantage = scipy.signal.lfilter([1], [1, -gamma], delta[::-1], axis=0)[::-1]

        # update the global network
        cost, _ = self.sess.run((self.loss, self.opt),
                                feed_dict={self.input_state: states,
                                           self.actions: actions,
                                           self.targets: targets,
                                           self.advantage: advantage})
        total_cost += cost

        if terminal:
            terminal = False
            print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                episode_reward, "/ COST", total_cost / counter
            episode_reward = 0
            total_cost = 0
            counter = 0
            # get the initial game observation
            state = env.get_initial_state()
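# A standalone sketch verifying the lfilter trick used in train() above: running
# scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1] over the TD residuals is
# equivalent to the explicit backward discounted sum of delta values.
import numpy as np
import scipy.signal

gamma = 0.99
delta = np.array([1.0, 0.5, -0.2, 0.3])  # made-up TD residuals

adv_filter = scipy.signal.lfilter([1], [1, -gamma], delta[::-1], axis=0)[::-1]

adv_loop = np.zeros_like(delta)
running = 0.0
for t in reversed(range(len(delta))):
    running = delta[t] + gamma * running
    adv_loop[t] = running

assert np.allclose(adv_filter, adv_loop)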
score = 0
step = 0
state = env.reset()
state = np.reshape(state, [1, state_size])

while not done:
    # if episode < agent.initial_train_episodes and agent.load_model == False:
    #     # explore in the first episodes
    #     if step % 4 < 2:
    #         action_index = random.randrange(6, 11)
    #     else:
    #         action_index = random.randrange(0, 5)
    # else:
    #     action_index = agent.get_action(state)
    action_index = agent.get_action(state)

    next_state, reward, done, info = env.step(action_index)
    next_state = np.reshape(next_state, [1, state_size])
    step += 1

    rad = math.acos(next_state[0][0])
    print("episode:{0} || step:{1} || action:{2} || pendulum radian:{3} || reward:{4} || done:{5}"
          .format(episode,
                  step,
                  action_index,
                  # round(next_state[0][0], 4),
                  rad,
                  round(reward, 2),
                  done))
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' + str(action_angular_max) + ' rad/s')

    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        var = 1.

        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(np.random.normal(a[0], var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], var), -0.5, 0.5)

                state_, r, done, arrive = env.step(a, past_action)
                time_step = agent.perceive(state, a, r, state_, done)

                if arrive:
                    result = 'Success'
                else:
                    result = 'Fail'

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    var *= 0.9999

                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var, '| Time step: %i' % time_step, '|', result)
                    one_round_step = 0
                if done or one_round_step >= 500:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var, '| Time step: %i' % time_step, '|', result)
                    break
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)

                state_, r, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0
                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
class Training:
    def __init__(self):
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []
        self.logging_data = []

        # parameters
        self.n_episodes = rospy.get_param("/n_episodes")
        self.n_step = rospy.get_param("/n_steps")
        self.mode_action = rospy.get_param('/mode_action')
        self.mem_size = rospy.get_param('/mem_size')
        self.batch_size = rospy.get_param('/batch_size')
        self.mode_optimize = rospy.get_param('/mode_optimize')
        self.avg_err_fre = rospy.get_param('/avg_err_fre')
        self.save_fre = rospy.get_param("/save_fre")
        self.load_checkpoint = rospy.get_param("/load_checkpoint")

        # create environment
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # create Deep Q-Network
        self.dqn = DQN(self.n_states, self.n_actions)
        self.memory = ExperienceReplay(self.mem_size)

        # plot
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'

        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # Figure 1 - Rewards
        self.fig1 = plt.figure(1)
        # fig = plt.figure(figsize=(12, 5))
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()

        title_1 = 'Rewards - (Mode: Training)'
        self.ax1.set_title(title_1)
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # Figure 2 - Error
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)

        title_2 = 'Error Distance - (Mode: Training)'
        self.ax3.set_title(title_2)
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')

        self.init_file()

    def moving_average(self, x, w):
        return np.convolve(x, np.ones(w), 'valid') / w

    def init_file(self):
        rospack = rospkg.RosPack()
        data_path = rospack.get_path("pioneer_dragging") + "/data"
        username = getpass.getuser()
        # n_folder = len(os.walk(data_path).__next__()[1])
        n_folder = glob("{}/{}*".format(data_path, username))
        n_folder = len(n_folder) + 1

        if self.load_checkpoint:
            n_folder -= 1

        self.data_path = "{}/{}-{}".format(data_path, username, n_folder)
        if not os.path.exists(self.data_path):
            os.mkdir(self.data_path)

        # config file
        if not self.load_checkpoint:
            config_path = rospack.get_path("pioneer_dragging") + "/config/dragging_params.yaml"
            config_log = '{}/{}-params.yaml'.format(self.data_path, n_folder)
            os.system('cp {} {}'.format(config_path, config_log))

            plot_style = {'plot_style': self.style_plot}
            with open(config_log, 'r') as yamlfile:
                cur_yaml = yaml.safe_load(yamlfile)  # note the safe_load
                cur_yaml.update(plot_style)

            if cur_yaml:
                with open(config_log, 'w') as yamlfile:
                    yaml.safe_dump(cur_yaml, yamlfile)  # also note the safe_dump

        # history file
        self.history_log = '{}/{}-log.txt'.format(self.data_path, n_folder)

        # model file
        self.dqn.file_models = '{}/{}-pytorch-RL.tar'.format(self.data_path, n_folder)

        # memory file
        self.memory.file_mem = '{}/{}-memory.data'.format(self.data_path, n_folder)

        # figures file
        self.figure1 = '{}/{}-Rewards(Training).png'.format(self.data_path, n_folder)
        self.figure2 = '{}/{}-Error(Training).png'.format(self.data_path, n_folder)

    def plot_result(self, i_episode, cumulated_reward, epsilon, error_dist, loaded=False):
        ### Figure 1
        # plot bar (cumulated reward)
        self.ax1.bar(i_episode, cumulated_reward, color=self.color1)

        # plot line (epsilon decay)
        if loaded:
            self.ax2.plot(i_episode, epsilon, color=self.color2)
            self.n_episode = i_episode.tolist()
            self.n_epsilon = epsilon.tolist()
            self.n_dist = error_dist.tolist()
        else:
            self.n_episode.append(i_episode)
            self.n_epsilon.append(epsilon)
            self.ax2.plot(self.n_episode, self.n_epsilon, color=self.color2)
            self.n_dist.append(error_dist)

        ### Figure 2
        # plot bar (error distance)
        self.ax3.bar(i_episode, error_dist, color=self.color3)
        # window_err = np.array(self.n_dist)
        # window_err = np.mean(window_err)
        # self.avg_err.append(window_err)
        # self.ax3.plot(self.n_episode, self.avg_err, color=self.color4)

        # plot line (average error distance)
        if len(self.n_dist) % self.avg_err_fre == 0:
            avg_err = self.moving_average(np.array(self.n_dist), self.avg_err_fre)
            self.ax3.plot(avg_err, color=self.color4)

        plt.draw()
        plt.pause(0.1)

    def run(self):
        start_time = time.time()

        if self.load_checkpoint:
            self.memory.load()
            self.dqn.load_model()

            # history log loaded
            self.logging_data = [line.rstrip('\n') for line in open(self.history_log)]

            hist_data = pd.read_csv(self.history_log, sep=",")
            i_episode = hist_data['i_episode']
            cumulated_reward = hist_data['cumulated_reward']
            epsilon = hist_data['epsilon']
            error_dist = hist_data['error_dist']
            self.plot_result(i_episode, cumulated_reward, epsilon, error_dist, loaded=True)

            i_episode = hist_data['i_episode'].iloc[-1] + 1
            self.dqn.epsilon = hist_data['epsilon'].iloc[-1]
            rospy.loginfo('[RL] Loaded checkpoint')
        else:
            i_episode = 0

        #########################################
        ###### Reinforcement Training loop ######
        for i_episode in range(i_episode, self.n_episodes):
            state = self.env.reset(i_episode)
            cumulated_reward = 0

            steps = 0
            step_time = time.time()

            while not rospy.is_shutdown():
                steps += 1
                action, epsilon = self.dqn.select_action(state, i_episode)
                # print('num_steps: {}, epsilon: {}, steps_done: {}'.format(steps, epsilon, dqn.steps_done))
                # action = env.action_space.sample()
                rospy.loginfo('[RL] action: {}'.format(action))

                next_state, reward, done, info = self.env.step(action)
                self.memory.push(state, action, next_state, reward, done)
                cumulated_reward += reward

                ################################
                ######### optimize #############
                if self.mode_optimize == 'normal_dqn':
                    # without experience replay memory
                    self.dqn.optimize(state, action, next_state, reward, done)
                elif self.mode_optimize == 'dqn_replay_memory':
                    # with experience replay memory
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(self.batch_size)
                        self.dqn.optimize_with_replay_memory(state_mem, action_mem, next_state_mem, reward_mem, done_mem)
                elif self.mode_optimize == 'dqn_taget_net':
                    # with experience target net
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(self.batch_size)
                        self.dqn.optimize_with_DQN(state_mem, action_mem, next_state_mem, reward_mem, done_mem)
                elif self.mode_optimize == 'dueling_dqn':
                    # with double DQN
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(self.batch_size)
                        self.dqn.optimize_with_dueling_DQN(state_mem, action_mem, next_state_mem, reward_mem, done_mem)

                if not done:
                    state = next_state
                else:
                    break

            # DQN update param
            self.dqn.update_param(i_episode)

            # plotting
            error_dist = self.env.calc_dist()
            self.plot_result(i_episode, cumulated_reward, epsilon, error_dist)

            # save checkpoint
            temp_data = "{},{},{},{}".format(i_episode, cumulated_reward, epsilon, error_dist)
            self.logging_data.append(temp_data)

            if i_episode % self.save_fre == 0:
                rospy.loginfo('[RL] Save checkpoint: {}'.format(i_episode))
                self.dqn.save_model()  # save models
                self.memory.save()     # save replay memory

                # logging file
                with open(self.history_log, 'w') as f:
                    if not self.load_checkpoint:
                        f.write("i_episode,cumulated_reward,epsilon,error_dist\n")
                    for item in self.logging_data:
                        f.write("%s\n" % item)

                # save figures
                self.fig1.savefig(self.figure1, dpi=self.fig1.dpi)
                self.fig2.savefig(self.figure2, dpi=self.fig2.dpi)
                rospy.loginfo('[RL] Save figure1: {}'.format(self.figure1))
                rospy.loginfo('[RL] Save figure2: {}'.format(self.figure2))

            # timing
            elapsed_time = time.time() - step_time
            total_time = time.time() - start_time
            print('\n********')
            print("Elapsed time: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
            print("Total time: {}".format(time.strftime("%H:%M:%S", time.gmtime(total_time))))

        # finish training
        self.env.close()
        print()
        rospy.loginfo('[RL] Exit ...')

        total_time = time.time() - start_time
        print('\n*********************')
        print("Total time: ", time.strftime("%H:%M:%S", time.gmtime(total_time)))
        rospy.loginfo('[RL] Style plot: {}'.format(self.style_plot))

        plt.show(block=True)
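# A standalone sketch of the moving_average helper in the Training class above:
# convolving with a length-w box kernel in 'valid' mode returns the len(x) - w + 1
# window means that are plotted over the per-episode error bars.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
w = 3
print(np.convolve(x, np.ones(w), 'valid') / w)  # [2. 3. 4.]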
his_hdqn, his_sto = [], []
his_gtd, his_gcd, his_gac = [], [], []
state = env.init()

while True:
    # 1. DQN
    # action = dqn.choose_action(state)  # RL chooses an action based on the state
    # next_state, reward, done, env_data = env.step(state, action)  # RL takes the action and gets the next observation and reward
    # dqn.store_transition(state, action, reward, next_state)
    # if dqn.memory_counter > dqn.memory_size and step % 5 == 0:
    #     dqn.learn()

    # HDQN
    goal = hdqn.ac_agent.choose_goal(state)
    action = hdqn.ts_agent.choose_action(state, [goal])
    next_state, reward, done, env_data = env.step(state, action)

    # 2. Stochastic
    a_sto = sto.choose_action()
    r_sto, env_sto = env.step(state, a_sto, is_update=False)

    # 3. Greedy transmission delay
    a_gtd = greed.choose_action(state, trans_delay=True)
    r_gtd, env_gtd = env.step(state, a_gtd, is_update=False)

    # 4. Greedy computation delay
    a_gcd = greed.choose_action(state, cal_delay=True)
    r_gcd, env_gcd = env.step(state, a_gcd, is_update=False)

    # 5. Greedy accuracy
    a_gac = greed.choose_action(state, accuracy=True)
    r_gac, env_gac = env.step(state, a_gac, is_update=False)

    # history record
def actor_learner_thread(self, env, thread_id, num_actions):
    # create an instance of the Doom environment
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

    print 'Starting thread ' + str(thread_id)
    time.sleep(3 * thread_id)

    # get the initial game observation
    state = env.get_initial_state()

    # episode's counters
    episode_reward = 0
    counter = 0

    while self.T < self.TMAX:
        done = False

        # clear gradients
        states = []
        actions = []
        prev_reward = []

        t = 0
        t_start = t

        # synchronize policy and value network
        self.session.run(self.update_policy[thread_id])
        self.session.run(self.update_value[thread_id])

        while not (done or ((t - t_start) == t_max)):
            # forward pass of the network: get probabilities of all actions
            probs = self.session.run(self.local_policy[thread_id],
                                     feed_dict={self.local_states[thread_id]: [state]})[0]

            # define the action list: all values are zero except the action that is executed
            action_list = np.zeros([num_actions])

            # choose an action based on the policy
            action_index = sample_policy_action(num_actions, probs)
            action_list[action_index] = 1

            # add state and action to the lists
            actions.append(action_list)
            states.append(state)

            # Gym executes the action in the game environment on behalf of the actor-learner
            new_state, reward, done = env.step(action_index)

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)
            prev_reward.append(clipped_reward)

            # update the state and global counters
            state = new_state
            self.T += 1
            t += 1
            counter += 1

            # update the episode's counter
            episode_reward += reward

            # save model progress
            if counter % FLAGS.checkpoint_interval == 0:
                if FLAGS.game_type == 'Doom':
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                    global_step=counter)
                else:
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt",
                                    global_step=counter)

        if done:
            R_t = 0
        else:
            R_t = self.session.run(self.local_value[thread_id],
                                   feed_dict={self.local_states[thread_id]: [state]})[0][0]

        targets = np.zeros((t - t_start))

        for i in range(t - t_start - 1, -1, -1):
            R_t = prev_reward[i] + FLAGS.gamma * R_t
            targets[i] = R_t

        # update the q value network
        self.session.run(self.grad_update, feed_dict={self.state: states,
                                                      self.actions: actions,
                                                      self.targets: targets})

        if done:
            print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", counter, "/ REWARD", episode_reward
            episode_reward = 0
            # get the initial game observation
            state = env.get_initial_state()
from environment import Env
import pandas as pd
import random

data = pd.read_csv("data/target.csv")["Y"]
env = Env(data)
done = False

while not done:
    price = env.getState()
    print(price)
    # 0 == buy, 1 == sell, 2 == do nothing (randint is inclusive, so the upper bound is 2)
    action = random.randint(0, 2)
    _, done = env.step(action)
    env.render()
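# A minimal, hypothetical stub of the Env interface the script above relies on
# (getState / step / render). The real class lives in environment.py and its
# internals will differ; this only illustrates the calling convention.
import pandas as pd

class StubEnv:
    def __init__(self, data):
        self.data = data.reset_index(drop=True)
        self.t = 0

    def getState(self):
        # current price observation
        return self.data[self.t]

    def step(self, action):
        # advance one tick; the episode is done when the series is exhausted
        self.t += 1
        done = self.t >= len(self.data) - 1
        return None, done

    def render(self):
        print(self.t, self.data[self.t])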
def play_episode(self, n_tot):
    self.beta_net.eval()
    self.beta_target.eval()
    self.pi_net.eval()
    self.pi_target.eval()
    self.vb_net.eval()
    self.vb_target.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.qb_net.eval()
    self.qb_target.eval()

    env = Env()
    n_human = 120
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=np.int)
        mask = Variable(torch.FloatTensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]),
                        requires_grad=False).cuda()

        j = 0
        temp = 1

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            beta, phi = self.beta_net(s)
            pi, _ = self.pi_net(s)
            q, _ = self.q_net(s)
            vb, _ = self.vb_net(s)

            pi = beta.squeeze(0)
            self.greedy = False

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                # eps = np.random.rand()
                eps = 1
                # a = np.random.choice(choices)
                if self.greedy and eps > 0.01:
                    a = pi.data.cpu().numpy()
                    a = np.argmax(a)
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            q = q[0, a]
            q = q.squeeze(0)

            env.step(a)

            yield {'o': env.s.cpu().numpy(),
                   'v': vb.squeeze(0).data.cpu().numpy(),
                   'vb': vb.squeeze(0).data.cpu().numpy(),
                   'qb': q.squeeze(0).data.cpu().numpy(),
                   # 's': x[0, :512].data.cpu().numpy(),
                   'score': env.score,
                   'beta': pi.data.cpu().numpy(),
                   'phi': phi.squeeze(0).data.cpu().numpy(),
                   'q': q.squeeze(0).data.cpu().numpy()}

            j += 1
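# A standalone sketch of the temperature-softmax sampling used in play_episode above:
# dividing the logits by temp before the softmax flattens (temp > 1) or sharpens
# (temp < 1) the action distribution that np.random.choice draws from.
import numpy as np

def sample_with_temperature(logits, temp=1.0):
    z = logits / temp
    z = z - z.max()                  # for numerical stability
    p = np.exp(z) / np.exp(z).sum()
    return np.random.choice(len(logits), p=p)

logits = np.array([2.0, 1.0, 0.1])
print(sample_with_temperature(logits, temp=1.0))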
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    # import ipdb
    # ipdb.set_trace()
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' + str(action_angular_max) + ' rad/s')
    print('Action Min: ' + str(action_linear_min) + ' m/s and ' + str(action_angular_min) + ' rad/s')

    #########################################################################################
    # Training
    #########################################################################################
    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        action_var = 0.2
        success_rate = 0

        # log path setting
        now = datetime.datetime.now()
        logdir = now.strftime('%Y-%m-%d') + '_' + now.strftime('%H-%M')
        logdir = os.path.join(log_dir, logdir)
        # tb_writer = SummaryWriter(logdir)

        # start training
        start_time = time.time()
        for itr in range(10000):
            state = env.reset()
            # episode_reward = 0.0

            # for each episode
            for cur_step in range(max_episode_length):
                action = agent.action(state)
                action[0] = np.clip(np.random.normal(action[0], action_var), action_linear_min, action_linear_max)
                action[1] = np.clip(np.random.normal(action[1], action_var), action_angular_min, action_angular_max)

                state_, reward, done, arrive = env.step(action, past_action)
                time_step = agent.perceive(state, action, reward, state_, done)

                ########################################################################################
                # debugging environment
                ########################################################################################
                if is_debugging:
                    print('cur_step: {}'.format(cur_step))
                    print('action: {}'.format(action))
                    print('goal position: x:{}, y:{}'.format(env.goal_position.position.x,
                                                             env.goal_position.position.y))
                    print('r: {}, done: {}, arrive: {}'.format(reward, done, arrive))
                ########################################################################################

                result = 'Success' if arrive else 'Fail'

                if time_step > 0:
                    total_reward += reward

                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward: {}'.format(avg_reward))
                    avg_reward_his.append(round(avg_reward, 2))
                    # writer.add_scalar('avg_reward', avg_reward, time_step)
                    print('Overall average Reward: {}'.format(avg_reward_his))
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    action_var *= 0.9999

                past_action = action
                state = state_

                if arrive or done or cur_step >= max_episode_length:
                    if result == 'Success':
                        success_rate += 1
                    sec = time.time() - start_time
                    elapsed_time = str(datetime.timedelta(seconds=sec)).split('.')[0]
                    print('Num_episode: {}, Full steps: {}, Result: {}, Elapsed time: {}'
                          .format(itr, cur_step, result, elapsed_time))
                    if itr % 20 == 0 and itr > 0:
                        print('Total: {}/20, Success rate: {}'.format(success_rate, round(success_rate / 20, 2)))
                        success_rate = 0
                    break

    #########################################################################################
    # Testing
    #########################################################################################
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)

                state_, reward, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0
                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5
    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0, 0)
    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()
                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])
                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
                print("check path", ckpt_path)

                save_checkpoint({
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                }, filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
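# A hypothetical sketch of what a ZFilter-style running-state normalizer does with the
# n / mean / sum_square fields that the checkpoint above saves and restores: maintain
# running statistics (Welford's algorithm) and whiten each observation, clipped to a
# range. The real ZFilter implementation in the repo will differ in detail.
import numpy as np

class RunningNorm:
    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)
        self.clip = clip

    def __call__(self, x):
        self.n += 1
        old_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.sum_square += (x - old_mean) * (x - self.mean)
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.ones_like(self.mean)
        return np.clip((x - self.mean) / (np.sqrt(var) + 1e-8), -self.clip, self.clip)

norm = RunningNorm((2,))
print(norm(np.array([1.0, -1.0])))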
class DDPGStage:
    def __init__(self, model, is_training=False, var=1.):
        self.max_step = 200
        self.exploration_decay_start_step = 50000
        state_dim = 366
        action_dim = 2
        self.action_linear_max = 0.25  # m/s
        self.action_angular_max = 0.5  # rad/s
        rospy.init_node('ddpg_stage_1')
        rospy.on_shutdown(self.clear_vel)
        self.is_training = is_training

        if ['/gazebo/model_states', 'gazebo_msgs/ModelStates'] in rospy.get_published_topics():
            self.env = SimEnv(self.is_training)
            print("Gazebo mode")
        else:
            self.env = Env(self.is_training)
            print("Real world mode")

        self.agent = DDPG(model, self.env, state_dim, action_dim)
        self.past_action = np.array([0., 0.])
        print('State Dimensions: ' + str(state_dim))
        print('Action Dimensions: ' + str(action_dim))
        print('Action Max: ' + str(self.action_linear_max) + ' m/s and ' + str(self.action_angular_max) + ' rad/s')
        self.var = var

    def _train(self):
        print('Training mode')
        avg_reward_his = []
        total_reward = 0

        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0

            while not rospy.is_shutdown():
                a = self.agent.action(state)
                a[0] = np.clip(np.random.normal(a[0], self.var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], self.var), -0.5, 0.5)

                state_, r, collision, arrive = self.env.step(a)
                time_step = self.agent.perceive(state, a, r, state_, collision)

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > self.exploration_decay_start_step and self.var > 0.1:
                    self.var *= 0.9999

                state = state_
                one_round_step += 1
                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))

                result = 'Step: %3i | Reward: %.2f | Var: %.2f | Time step: %i |' % (one_round_step, r, self.var, time_step)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def _evaluate(self):
        print('Testing mode')
        self.env.goal_range["x"] = [-1, 1]
        self.env.goal_range["y"] = [-1, 1]

        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0

            while not rospy.is_shutdown():
                a = self.agent.action(state)
                print("action: %s" % a)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)

                state_, r, collision, arrive = self.env.step(a)
                state = state_
                one_round_step += 1
                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))

                result = 'Step: %3i | Reward: %.2f | Var: %.2f |' % (one_round_step, r, self.var)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                    # input()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def run(self):
        # try:
        if self.is_training:
            self._train()
        else:
            self._evaluate()
        self.env.pub_cmd_vel.publish(Twist())

    def clear_vel(self):
        self.env.pub_cmd_vel.publish(Twist())
def play_episode(self, n_tot):
    self.model.eval()
    self.model_b.eval()

    env = Env()
    n_human = 120
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=np.int)
        mask = Variable(torch.FloatTensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                        requires_grad=False).cuda()

        j = 0
        temp = 1

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            beta, vb, qb, _, _ = self.model_b(s, self.actions_matrix)
            pi, v, q, adv, x = self.model(s, self.actions_matrix, beta.detach())
            pi = pi.squeeze(0)
            self.greedy = False

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = np.random.rand()
                # a = np.random.choice(choices)
                if self.greedy and eps > 0.1:
                    a = pi.data.cpu().numpy()
                    a = np.argmax(a)
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            q = q[0, a, 0]
            q = q.squeeze(0)
            qb = qb[0, a, 0]
            qb = qb.squeeze(0)

            env.step(a)

            yield {'o': env.s.cpu().numpy(),
                   'v': v.squeeze(0).data.cpu().numpy(),
                   'vb': vb.squeeze(0).data.cpu().numpy(),
                   'qb': qb.squeeze(0).data.cpu().numpy(),
                   's': x[0, :512].data.cpu().numpy(),
                   'score': env.score,
                   'beta': pi.data.cpu().numpy(),
                   'phi': x[0, :512].data.cpu().numpy(),
                   'q': q.squeeze(0).data.cpu().numpy()}

            j += 1
for i_episode in range(default_config["max_iteration"]):
    attack_mode = random.randint(0, 6)
    state_new = env.reset(attack_mode)
    agent.update_current_channel(state_new)
    done = False

    for t in range(default_config["max_episode_length"]):
        # get the current channel as a one-hot vector
        x = np.zeros(default_config["max_channel"])
        x[agent.cur_channel] = 1

        # feed it into the policy networks
        action_c = agent.c_policy.select_action(x).cpu().detach().numpy()[0]
        action_s = agent.s_policy.select_action(x).cpu().detach().numpy()[0]
        # print(int(action_c), " ", int(action_s))
        state_new, reward, done, info = env.step(int(action_c), int(action_s))
        agent.update_current_channel(state_new)

        reward_sum += reward
        agent.c_policy.rewards.append(reward)
        agent.s_policy.rewards.append(reward)

        if done:
            # tracking log
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('REINFORCE ep %03d done. reward: %f. reward running mean: %f'
                  % (i_episode, reward_sum, running_reward))
            if i_episode % default_config["log_freq"] == 0:
                with open(filename, 'a') as file_object:
                    file_object.write('REINFORCE ep %d done. reward: %f. reward running mean: %f\n'
                                      % (i_episode, reward_sum, running_reward))
def actor_learner_thread(self, env, thread_id, num_actions):
    # create an instance of the Doom environment
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

    # initialize network gradients
    states = []
    actions = []
    targets = []

    initial_epsilon = 1
    epsilon = 1
    final_epsilon = self.sample_final_epsilon()

    print('Starting thread ' + str(thread_id) + ' with final epsilon ' + str(final_epsilon))
    time.sleep(3 * thread_id)
    t = 0

    while self.T < self.TMAX:
        # get the initial game observation
        state = env.get_initial_state()
        done = False

        # episode's counters
        episode_reward = 0
        mean_q = 0
        frames = 0

        while not done:
            # forward pass of the network: get Q(s, a)
            q_values = self.q_values.eval(session=self.session, feed_dict={self.state: [state]})

            # define the action list: all values are zero except the action that is executed
            action_list = np.zeros([num_actions])
            action_index = 0

            # choose an action based on the current epsilon-greedy policy
            if random.random() <= epsilon:
                action_index = random.randrange(num_actions)
            else:
                action_index = np.argmax(q_values)
            action_list[action_index] = 1

            # anneal epsilon
            if epsilon > final_epsilon:
                epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps

            # decrease the learning rate
            if self.lr > 0:
                self.lr -= FLAGS.learning_rate / self.TMAX

            # Gym executes the action in the game environment on behalf of the actor-learner
            new_state, reward, done = env.step(action_index)

            # forward pass of the target network: get Q(s', a)
            target_q_values = self.target_q_values.eval(session=self.session,
                                                        feed_dict={self.new_state: [new_state]})

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)

            # compute targets based on the Q-learning update rule:
            # targets = r + gamma * max(Q(s', a))
            if done:
                targets.append(clipped_reward)
            else:
                targets.append(clipped_reward + FLAGS.gamma * np.max(target_q_values))

            actions.append(action_list)
            states.append(state)

            # update the state and global counters
            state = new_state
            self.T += 1
            t += 1

            # update the episode's counters
            frames += 1
            episode_reward += reward
            mean_q += np.max(q_values)

            # update the target network
            if self.T % FLAGS.target_network_update_frequency == 0:
                self.session.run(self.update_target)

            # train the online network
            if t % FLAGS.network_update_frequency == 0 or done:
                if states:
                    self.session.run(self.grad_update, feed_dict={self.state: states,
                                                                  self.actions: actions,
                                                                  self.targets: targets,
                                                                  self.learning_rate: self.lr})
                # clear gradients
                states = []
                actions = []
                targets = []

            # save model progress
            if t % FLAGS.checkpoint_interval == 0:
                if FLAGS.game_type == 'Doom':
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                    global_step=t)
                else:
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt",
                                    global_step=t)

            # print end-of-episode stats
            if done:
                print("THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", t,
                      "/ EPSILON", epsilon, "/ REWARD", episode_reward,
                      "/ Q_MAX %.4f" % (mean_q / float(frames)),
                      "/ EPSILON PROGRESS", t / float(FLAGS.anneal_epsilon_timesteps))
                break
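# A standalone sketch (made-up numbers) of the target computation in the thread above:
# y = r for terminal transitions, otherwise y = r + gamma * max_a Q_target(s', a).
import numpy as np

gamma = 0.99
clipped_reward = 1.0
target_q_values = np.array([0.2, 1.3, -0.4])  # Q_target(s', a) per action

done = False
y = clipped_reward if done else clipped_reward + gamma * np.max(target_q_values)
print(y)  # 1.0 + 0.99 * 1.3 = 2.287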
N = 20
env = Env(dt=np.pi / N)
RL = PolicyGradient(
    n_actions=env.n_actions,
    n_features=env.n_states,
    learning_rate=0.002,
    reward_decay=0.99,
)

fid_10 = 0
ep_max = 500
for episode in range(ep_max):
    observation = env.reset()
    for ii in range(N):
        action = RL.choose_action(observation)
        observation_, reward, done, fid = env.step(action)
        RL.store_transition(observation, action, reward)
        observation = observation_
        if done:
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)
            break
    RL.learn()

print('Final_fidelity=', fid_10)
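# A hypothetical sketch of what reward_decay implies inside PolicyGradient.learn():
# discounted episode returns, usually normalized for variance reduction. The helper
# name and the normalization step are illustrative, not taken from the library above.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return (returns - returns.mean()) / (returns.std() + 1e-8)

print(discounted_returns([0.0, 0.0, 1.0]))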
def play_episode(self, n_tot):
    self.model.eval()
    env = Env()
    n_human = 120
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()
    # mask = torch.FloatTensor(consts.actions_mask[args.game])
    # mask = Variable(mask.cuda(), requires_grad=False)
    vsx = torch.FloatTensor(consts.short_bins[args.game])
    vlx = torch.FloatTensor(consts.long_bins[args.game])

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=np.int)

        j = 0

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(s, self.actions_matrix)

            beta = beta.squeeze(0)
            pi_l = pi_l.squeeze(0)
            pi_s = pi_s.squeeze(0)
            pi_l_tau = pi_l_tau.squeeze(0)
            pi_s_tau = pi_s_tau.squeeze(0)
            temp = 1

            # consider only the 3 most frequent actions
            beta_np = beta.data.cpu().numpy()
            indices = np.argsort(beta_np)

            maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                             requires_grad=False).cuda()
            # pi = maskb * (beta / beta.max())
            pi = beta

            self.greedy = False
            beta_prob = pi

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = np.random.rand()
                # a = np.random.choice(choices)
                if self.greedy and eps > 0.1:
                    a = pi.data.cpu().numpy()
                    a = np.argmax(a)
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            env.step(a)

            vs = softmax(vs)
            vl = softmax(vl)
            vs = torch.sum(vsx * vs.data.cpu())
            vl = torch.sum(vlx * vl.data.cpu())

            yield {'o': env.s.cpu().numpy(),
                   'vs': np.array([vs]),
                   'vl': np.array([vl]),
                   's': phi.data.cpu().numpy(),
                   'score': env.score,
                   'beta': beta_prob.data.cpu().numpy(),
                   'phi': phi.squeeze(0).data.cpu().numpy(),
                   'qs': qs.squeeze(0).data.cpu().numpy(),
                   'ql': ql.squeeze(0).data.cpu().numpy()}

            j += 1
global_step = 0
scores, episodes = [], []

for e in range(EPISODES):
    done = False
    score = 0
    # fresh env
    state = env.reset()
    state = np.reshape(state, [1, 15])

    while not done:
        global_step += 1
        # get an action for the current state and go one step in the environment
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)
        next_state = np.reshape(next_state, [1, 15])
        agent.append_sample(state, action, reward)
        score += reward
        state = copy.deepcopy(next_state)

        if done:
            # update the policy network once per episode
            agent.train_model()
            scores.append(score)
            episodes.append(e)
            score = round(score, 2)
            print("episode:", e, " score:", score, " time_step:", global_step)
for i in range(num_episodes):
    print("Episode {} of {}".format(i + 1, num_episodes))
    eps *= decay_factor
    r_sum = 0
    done = False
    diag_action = 0
    diag_reward = 0
    state = env.reset((i, num_episodes))

    while not done:
        env.reset((i, num_episodes))
        rand = np.random.random()
        if rand < eps:
            action = np.random.randint(0, 2)
        else:
            action = np.argmax(model.predict(np.identity(10)[state:state + 1]))

        new_s, r, done, _ = env.step(action=action, num=(i, num_episodes))
        target = r + y * np.max(model.predict(np.identity(10)[new_s:new_s + 1]))
        target_vec = model.predict(np.identity(10)[state:state + 1])[0]
        target_vec[action] = target
        model.fit(np.identity(10)[state:state + 1],
                  target_vec.reshape(-1, 2),
                  epochs=1,
                  verbose=0)

        state = new_s
        r_sum += r
        print('Action: {}, Reward: {}'.format(action, r))
        file.write('Action: {}, Reward: {}'.format(action, round(r, 2)))
        diag_action += action
        diag_reward += r

    r_avg_list.append(r_sum)
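# A standalone sketch of the np.identity(10)[state:state + 1] idiom used above: it
# slices out row `state` of the identity matrix as a (1, 10) one-hot batch, which is
# the shape model.predict expects.
import numpy as np

n_states, state = 10, 3
one_hot = np.identity(n_states)[state:state + 1]
print(one_hot.shape)    # (1, 10)
print(one_hot.argmax()) # 3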
# add.append(i)
action = []
for dim in range(2):
    action.append(int(env.cars_posit[dic_state[2][num][dim][3]]))
dic_action[2].append(action)

draw_action = [0 for l in range(len(env.cars_posit))]
for x in dic_state:
    for num in range(len(dic_state[x])):
        for dim in range(len(dic_state[x][num])):
            draw_action[dic_state[x][num][dim][3]] = dic_action[x][num][dim]
draw.piant(env.cars_posit, env.road_range, ax1, env.frame_slot, draw_action)

dic_state_, dic_reward = env.step(dic_action, tools)
print(dic_reward)

for x in dic_reward:
    for num in range(len(dic_reward[x])):
        for dim in range(x):
            suss += dic_reward[x][num][dim]
            total += env.beam_slot

print('Success rate', suss / total)
dic_state = dic_state_
success += suss
totally += total
zongzhou.append(success / totally)
for i_episode in range(default_config["max_iteration"]):
    attack_mode = random.randint(0, 6)
    state_new = env.reset(attack_mode)
    agent.update_current_channel(state_new)
    done = False

    for t in range(default_config["max_episode_length"]):
        # get the current channel as a one-hot vector
        x = np.zeros(default_config["max_channel"])
        x[agent.cur_channel] = 1

        # feed it into the policy networks
        action_c = agent.c_policy.select_action(x).cpu().detach().numpy()[0]
        action_s = agent.s_policy.select_action(x).cpu().detach().numpy()[0]
        state_new, reward, done, info = env.step(action_c, action_s)
        agent.update_current_channel(state_new)

        reward_sum += reward
        agent.c_policy.rewards.append(reward)
        agent.s_policy.rewards.append(reward)

        if done:
            # tracking log
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('REINFORCE POLICY GRADIENT WITH BASELINE ep %03d done. reward: %f. reward running mean: %f'
                  % (i_episode, reward_sum, running_reward))
            if i_episode % default_config["log_freq"] == 0:
                with open(filename, 'a') as file_object:
                    file_object.write('REINFORCE POLICY GRADIENT WITH BASELINE ep %d done. reward: %f. reward running mean: %f\n'
                                      % (i_episode, reward_sum, running_reward))
for e in range(EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, 20])

    while not done:
        # fresh env
        if agent.render:
            env.render()
        global_step += 1

        # get an action for the current state and go one step in the environment
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)
        next_state = np.reshape(next_state, [1, 20])
        agent.replay_memory(state, action, reward, next_state, done)
        # every time step we do training
        agent.train_replay()
        score += reward
        state = copy.deepcopy(next_state)

        # every 100 time steps update the target model to be the same as the model
        if global_step % 100 == 0:
            agent.update_target_model()

        if done:
            scores.append(score)
def play(self, n_tot, action_offset, player):
    self.beta_net.eval()
    self.beta_target.eval()
    self.pi_net.eval()
    self.pi_target.eval()
    self.vb_net.eval()
    self.vb_target.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.qb_net.eval()
    self.qb_target.eval()

    env = Env(action_offset)
    n_human = 90
    episodes = list(self.data.keys())
    random.shuffle(episodes)
    humans_trajectories = iter(episodes)

    for i in range(n_tot):
        env.reset()
        trajectory = self.data[next(humans_trajectories)]
        choices = np.arange(self.global_action_space, dtype=np.int)

        random_choices = self.mask_q.data.cpu().numpy()
        random_choices = random_choices / random_choices.sum()

        j = 0

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)

            # string comparison uses ==, not the identity operator `is`
            if player == 'beta':
                pi, _ = self.beta_net(s)
                pi = pi.squeeze(0)
                self.greedy = False
            elif player == 'q_b':
                pi, _ = self.qb_net(s)
                pi = pi.squeeze(0)
                self.greedy = True
            elif player == 'pi':
                pi, _ = self.pi_net(s)
                pi = pi.squeeze(0)
                self.greedy = False
            elif player == 'q_pi':
                pi, _ = self.q_net(s)
                pi = pi.squeeze(0)
                self.greedy = True
            else:
                raise NotImplementedError

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = np.random.rand()
                # eps = 1
                # a = np.random.choice(choices)
                if self.greedy:
                    if eps > 0.01:
                        a = (pi * self.mask_q).data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = np.random.choice(choices, p=random_choices)
                else:
                    a = F.softmax(pi + self.mask_beta, dim=0).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            env.step(a)
            j += 1

        yield {'score': env.score, 'frames': j}