def play_deterministic(self, n_tot):
    """Roll out deterministic episodes and yield each final score.

    The first frames of every episode replay the recorded human actions;
    after the human prefix, actions are decoded from the model's
    thresholded beta output via the reverse excitation lookup.
    """
    self.model.eval()
    env = Env()
    render = args.render  # read for parity with the original; unused below
    human_prefix_len = 60
    episode_iter = iter(self.data)
    excitation_lookup = consts.reverse_excitation_index
    for _ in range(n_tot):
        env.reset()
        episode_key = next(episode_iter)
        print("Observation %s" % episode_key)
        human_actions = self.data[episode_key]
        frame = 0
        ims = []  # kept for parity; never filled
        while not env.t:
            if frame < human_prefix_len:
                action = human_actions[frame, self.meta['action']]
            else:
                state = Variable(env.s.cuda() if self.cuda else env.s,
                                 requires_grad=False)
                _, _, beta, _, _, _ = self.model(state)
                beta = beta.squeeze(0)
                # Keep the sign only where |beta| exceeds the 0.5 threshold.
                beta = beta.sign().int() * (beta.abs() > 0.5).int()
                action = excitation_lookup[tuple(beta.data)]
            env.step(action)
            frame += 1
        yield env.score
def play_episode_deterministic(self, n_tot):
    """Generator of per-step diagnostics over `n_tot` replayed episodes.

    For the first `n_human` frames the recorded human action is replayed;
    afterwards the action is decoded from the model's thresholded beta
    output. Yields a dict of numpy arrays per environment step.

    Fixes:
    - `tuple(beta_index.data)` iterated the numpy array's raw memoryview
      (its bytes), not its elements; the lookup key must come from the
      array itself.
    - The trailing `raise StopIteration` is illegal in a generator under
      PEP 479 (RuntimeError on Python >= 3.7); the generator now simply
      returns when the loop ends.
    """
    self.model.eval()
    env = Env()
    n_human = 300  # frames of recorded human play before the model acts
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index
    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        j = 0
        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            v, q, beta, r, p, phi = self.model(s)
            beta = beta.squeeze(0)
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                # Threshold at |beta| > 0.5, keep the sign, move to numpy.
                beta_index = (beta.sign().int() *
                              (beta.abs() > 0.5).int()).data.cpu().numpy()
                # First component is forced non-negative before the lookup.
                beta_index[0] = abs(beta_index[0])
                a = reverse_excitation_index[tuple(beta_index)]
            env.step(a)
            yield {
                'o': env.s.cpu().numpy(),
                'v': v.data.cpu().numpy(),
                's': phi.data.cpu().numpy(),
                'score': env.score,
                'beta': beta.data.cpu().numpy(),
                'phi': phi.squeeze(0).data.cpu().numpy(),
            }
            j += 1
def experiment1_test(
        output_folder,
        word_vectors,
        agent,
        episode_index,
        testset_path='./dataset/conll2003/en/eng.testb',
):
    """Evaluate a trained agent on the CoNLL-2003 test set.

    Runs the agent until the environment reports the end of the corpus,
    dumps the tagged output, and returns the conlleval metrics.
    """
    # Build the evaluation environment over the test corpus.
    env = Env(testset_path, word_vectors)
    n_steps = 0
    state = env.reset()
    print('[' + util.now_time() + "] start testing...")
    while not env.end():
        # Act greedily and advance to the next token/state.
        action = agent.choose_action(state)
        state, _reward = env.step(action)
        n_steps += 1
    print('[' + util.now_time() + "] testing...done")
    result_file = '%03d_episode_test.txt' % (episode_index + 1)
    env.save_all_newlines_to_file(output_folder, result_file)
    return evaluate.conlleval(output_folder, result_file)
def cartpole():
    """Train a DQN against the remote CartPole server until interrupted.

    Each terminal step logs the episode score and saves a loss plot to
    loss.png.
    """
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    n_obs = env.observation_space.box.shape[0]
    n_act = env.action_space.discrete.n
    solver = DQNSolver(n_obs, n_act)
    episode = 0
    while True:
        episode += 1
        state = np.reshape(env.reset(), [1, n_obs])
        tick = 0
        while True:
            tick += 1
            print("acting on state: ", state)
            action = solver.act(state)
            next_state, reward, terminal, info = env.step(action)
            # Penalize the terminal transition by negating its reward.
            if terminal:
                reward = -reward
            next_state = np.reshape(next_state, [1, n_obs])
            solver.remember(state, action, reward, next_state, terminal)
            state = next_state
            if terminal:
                print("Run: " + str(episode) + ", exploration: " +
                      str(solver.exploration_rate) + ", score: " + str(tick))
                score_logger.add_score(tick, episode)
                # Persist the running loss curve after every episode.
                plt.plot(solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            solver.experience_replay()
def play_deterministic(self, n_tot):
    """Roll out deterministic episodes; yield final observation and score.

    The first frames replay recorded human actions; afterwards the action
    is decoded from the model's thresholded beta output.
    """
    self.model.eval()
    env = Env()
    render = args.render  # read for parity with the original; unused below
    human_prefix_len = 60
    episode_iter = iter(self.data)
    excitation_lookup = consts.reverse_excitation_index
    for _ in range(n_tot):
        env.reset()
        episode_key = next(episode_iter)
        print("Observation %s" % episode_key)
        human_actions = self.data[episode_key]
        frame = 0
        while not env.t:
            if frame < human_prefix_len:
                action = human_actions[frame, self.meta['action']]
            else:
                state = Variable(env.s.cuda() if self.cuda else env.s,
                                 requires_grad=False)
                _, _, beta, _, _, _ = self.model(state)
                beta = beta.squeeze(0)
                # Keep the sign only where |beta| exceeds the 0.5 threshold.
                index = (beta.sign().int() * (beta.abs() > 0.5).int()).data
                index = index.cpu().numpy() if self.cuda else index.numpy()
                # First component is forced non-negative before the lookup.
                index[0] = abs(index[0])
                action = excitation_lookup[tuple(index)]
            env.step(action)
            frame += 1
        yield {'o': env.s.cpu().numpy(), 'score': env.score}
def experiment1_train(
        output_folder,
        word_vectors,
        n_episodes=300,
        trainset_path='./dataset/conll2003/en/eng.train',
):
    """Train a DQN tagger on the CoNLL-2003 training corpus.

    After each episode the tagged output is saved and scored with
    conlleval on both the train and test sets; the evaluation network is
    saved once all episodes are done.
    """
    # Build the training environment.
    print('[' + util.now_time() + "] init environment...")
    env = Env(trainset_path, word_vectors)
    print('[' + util.now_time() + "] 环境初始化完毕")
    # Build the agent with dimensions taken from the environment.
    print('[' + util.now_time() + "] init agent...")
    agent = DQN(n_actions=env.n_actions,
                status_dim=env.status_dim,
                action_dim=env.action_dim,
                reward_dim=env.reward_dim)
    print('[' + util.now_time() + "] agent初始化完毕")
    for episode in range(n_episodes):
        print('[' + util.now_time() +
              "] start episode %03d of learning..." % (episode + 1))
        n_steps = 0
        state = env.reset()
        while not env.end():
            action = agent.choose_action(state)
            next_state, reward = env.step(action)
            agent.store_transition(state, action, reward, next_state)
            n_steps += 1
            state = next_state
            # Learn only after a 200-step warm-up, then every 5th step.
            if n_steps > 200 and n_steps % 5 == 0:
                agent.learn()
        print('[' + util.now_time() +
              "] episode %03d of learning...done" % (episode + 1))
        result_file = '%03d_episode_train.txt' % (episode + 1)
        env.save_all_newlines_to_file(output_folder, result_file)
        train_eval = evaluate.conlleval(output_folder, result_file)
        test_eval = experiment1_test(output_folder, word_vectors, agent, episode)
    # plot and compare train and test set TODO
    # plot(train_evals, test_evals)
    agent.eval_network.save(output_folder + os.path.sep + 'ex1_eval_model',
                            overwrite=True)
class CarEnvironment(Environment):
    """Adapter exposing the car simulation (an `Env` over a `Grid`) through
    the `Environment` interface: sensor readout, action execution, reset."""

    def __init__(self):
        # Last action requested via performAction(); consumed by step().
        self.action = [0.0, 0.0]
        self.delay = False
        self.grid = Grid()
        self.env = Env(self.grid)
        self.reset()

    def step(self):
        # Simulate a step in the environment: hand the stored action to the
        # agent's brain, then tick the simulation and apply all moves.
        self.agent.brain.stored_action = self.action
        self.env.tick()
        self.env.calculate_moves()
        self.env.do_moves()
        # Print/render the environment at the current tick number.
        self.env.print_env(self.env.tick_number)

    def reset(self):
        """Reset the simulation and re-bind the first car as the agent."""
        self.env.reset()
        self.agent = self.env.cars[0]
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        self.sensors = self.agent.brain.get_state_tuple()
        self.distance_to_goal = self.agent.brain.distance_to_goal()

    def getSensors(self):
        """Return the agent's current sensor tuple.

        (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        """
        return self.agent.brain.get_state_tuple()

    def getCarState(self):
        # Full state object (richer than the sensor tuple).
        return self.agent.brain.get_state()

    def in_goal_state(self):
        # True once the agent's state reports the goal was reached.
        return self.agent.brain.get_state().reached_goal

    def performAction(self, action):
        """Store `action` and advance the simulation one step with it."""
        self.action = action
        self.step()

    def indim(self):
        # Action dimensionality: two continuous controls.
        return 2

    def outdim(self):
        # Sensor dimensionality, taken from the live sensor tuple.
        return len(self.getSensors())
def run_exp(cfg=None):
    """Run a DQN experiment loop: environment stepping, periodic agent
    training, video logging, and distractor-table toggling; finally plots
    a histogram of the explored joint angles.

    `cfg` is a config object; after constructing the helpers the local
    name is rebound to its `exp` sub-config, which drives the loop.
    """
    logger = Logger(cfg)
    agent = DQNAgent(cfg)
    env = Env(cfg)
    trainer = Trainer(env, agent, cfg)
    cfg = cfg.exp  # from here on only the experiment sub-config is used
    n_training_steps = cfg.n_episodes // cfg.train_after
    global_step = 0
    state = env.reset()
    # One recorded joint position per loop iteration.
    joint_angles = np.empty(cfg.n_episodes)
    for step in range(cfg.n_episodes):
        state = trainer.single_step(state)
        # agent training: every `train_after`-th step
        if global_step % cfg.train_after == (cfg.train_after - 1):
            print(f"step: {step}")
            print("Training agents")
            # fw model warmup phase of 2000 steps
            # NOTE(review): `global_step >= 0` is always true, so the policy
            # trains from the start; the warmup comment above suggests the
            # condition was meant to gate on e.g. >= 2000 — confirm intent.
            metrics_dict = agent.train(
                cfg.train_iv, cfg.train_fw,
                cfg.train_policy if global_step >= 0 else False)
            logger.log_metrics(metrics_dict, global_step)
            logger.log_all_network_weights(agent.joint_agents[0], step)
            agent.decrease_eps(n_training_steps)
        # video logging
        if global_step % cfg.video_after == 0:
            print("logging video")
            vis, debug0, debug1 = trainer.record_frames(debug_cams=True)
            logger.log_vid_debug_cams(vis, debug0, debug1, global_step)
        # distractor toggling
        if global_step % cfg.toggle_table_after == (cfg.toggle_table_after - 1):
            env.toggle_table()
        global_step += 1
        # Record the first joint's position for the exploration histogram.
        pos = env.get_joint_positions()[0]
        joint_angles[step] = pos
    # Negate and convert to degrees, then save the histogram plot.
    joint_angles = np.degrees(-joint_angles)
    plt.hist(joint_angles, bins=20, range=(0, 170))
    plt.savefig(os.path.join("plots", "explored_angles.png"))
for station in range(10): target_station_id = np.random.randint(num_stations) dests.append(mapo[target_station_id]) # get real station ID arriveTimes = [np.random.randint(12) for _ in range(10)] # cal. incentive preds = np.array(env.calIncentive(dests, arriveTimes, infos)) max_index = np.random.choice(np.flatnonzero( preds == preds.max())) # randomly pick max value's index dest = dests[max_index] # rent bike # get a starting point if env.bikes.count(0) == num_stations: env.reset(num_stations, num_bikes_per_station) source = None while True: target_station_id = np.random.randint(num_stations) if env.bikes[target_station_id] == 0: continue else: source = mapo[target_station_id] break res = agent.rentBike(source, dest, i, arriveTimes[max_index], args) if res is False: bankrupts.append(agent_idx) else: # cal. number of bikes
def main():
    """DDPG training/testing entry point for a ROS stage robot.

    In training mode runs 10000 episodes with Gaussian exploration noise
    on the (linear, angular) action; in testing mode runs greedy rollouts
    forever. NOTE(review): original indentation was lost in this chunk and
    has been reconstructed — verify nesting against the upstream file.
    """
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    # import ipdb
    # ipdb.set_trace()
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' + str(action_angular_max) + ' rad/s')
    print('Action Min: ' + str(action_linear_min) + ' m/s and ' + str(action_angular_min) + ' rad/s')

    #########################################################################################
    # Training
    #########################################################################################
    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        action_var = 0.2  # exploration noise std-dev
        success_rate = 0

        # Log path setting
        now = datetime.datetime.now()
        # NOTE(review): '%Y-%M-%d' uses %M (minutes), almost certainly meant
        # %m (month); logdir is also unused while tb_writer is commented out.
        logdir = now.strftime('%Y-%M-%d') + '_' + now.strftime('%H-%M')
        logdir = os.path.join(log_dir, logdir)
        # tb_writer = SummaryWriter(logdir)

        # Start training
        start_time = time.time()
        for itr in range(10000):
            state = env.reset()
            # episode_reward = 0.0
            # For each episode
            for cur_step in range(max_episode_length):
                action = agent.action(state)
                # Add exploration noise, clipped to the valid command range.
                action[0] = np.clip(np.random.normal(action[0], action_var),
                                    action_linear_min, action_linear_max)
                action[1] = np.clip(np.random.normal(action[1], action_var),
                                    action_angular_min, action_angular_max)
                state_, reward, done, arrive = env.step(action, past_action)
                time_step = agent.perceive(state, action, reward, state_, done)

                ########################################################################################
                # debugging environment
                ########################################################################################
                if is_debugging:
                    print('cur_step: {}'.format(cur_step))
                    print('action: {}'.format(action))
                    print('goal position: x:{}, y:{}'.format(
                        env.goal_position.position.x, env.goal_position.position.y))
                    print('r: {}, done: {}, arrive: {}'.format(
                        reward, done, arrive))
                ########################################################################################

                result = 'Success' if arrive else 'Fail'
                if time_step > 0:
                    total_reward += reward
                # Report the running average reward every 10000 agent steps.
                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward: {}'.format(avg_reward))
                    avg_reward_his.append(round(avg_reward, 2))
                    # writer.add_scalar('avg_reward', avg_reward, time_step)
                    print('Overall average Reward: {}'.format(avg_reward_his))
                    total_reward = 0
                # Decay exploration noise after the warm-up period.
                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    action_var *= 0.9999
                past_action = action
                state = state_
                # NOTE(review): cur_step ranges over [0, max_episode_length),
                # so `cur_step >= max_episode_length` is never true here.
                if arrive or done or cur_step >= max_episode_length:
                    if result == 'Success':
                        success_rate += 1
                    sec = time.time() - start_time
                    elapsed_time = str(
                        datetime.timedelta(seconds=sec)).split('.')[0]
                    print(
                        'Num_episode: {}, Full steps: {}, Result: {}, Elapsed time: {}'
                        .format(itr, cur_step, result, elapsed_time))
                    if itr % 20 == 0 and itr > 0:
                        # NOTE(review): misplaced paren — `round(x), 2` passes a
                        # stray extra arg to format; likely meant round(x, 2).
                        print('Total: {}/20, Success rate: {}'.format(
                            success_rate, round(success_rate / 20), 2))
                        success_rate = 0
                    break

    #########################################################################################
    # Testing
    #########################################################################################
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0
            while True:
                a = agent.action(state)
                # Clip to the valid command ranges (no exploration noise).
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, reward, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1
                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0
                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
def play_episode(self, n_tot):
    """Generator: roll out `n_tot` episodes, replaying recorded human
    actions for the first `n_human` frames and sampling from the
    behavioral (beta) head afterwards. Yields a diagnostics dict per
    environment step.

    Fixes:
    - `dtype=np.int` -> `dtype=int` (`np.int` alias removed in NumPy 1.20+);
    - generator now ends with an implicit return instead of
      `raise StopIteration`, which PEP 479 turns into a RuntimeError;
    - removed the unused `mask` CUDA tensor.
    """
    self.beta_net.eval()
    self.beta_target.eval()
    self.pi_net.eval()
    self.pi_target.eval()
    self.vb_net.eval()
    self.vb_target.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.qb_net.eval()
    self.qb_target.eval()
    env = Env()
    n_human = 120  # frames of recorded human play before the policy acts
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()
    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=int)
        j = 0
        temp = 1  # softmax temperature
        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            beta, phi = self.beta_net(s)
            pi, _ = self.pi_net(s)
            q, _ = self.q_net(s)
            vb, _ = self.vb_net(s)
            # The behavioral head drives the action distribution.
            pi = beta.squeeze(0)
            self.greedy = False
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = 1
                if self.greedy and eps > 0.01:
                    a = np.argmax(pi.data.cpu().numpy())
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)
            # Q value of the chosen action, for the diagnostics below.
            q = q[0, a]
            q = q.squeeze(0)
            env.step(a)
            yield {
                'o': env.s.cpu().numpy(),
                'v': vb.squeeze(0).data.cpu().numpy(),
                'vb': vb.squeeze(0).data.cpu().numpy(),
                'qb': q.squeeze(0).data.cpu().numpy(),
                'score': env.score,
                'beta': pi.data.cpu().numpy(),
                'phi': phi.squeeze(0).data.cpu().numpy(),
                'q': q.squeeze(0).data.cpu().numpy(),
            }
            j += 1
def play(self, n_tot, action_offset, player):
    """Generator: play `n_tot` episodes with the head chosen by `player`
    ('beta', 'q_b', 'pi', or 'q_pi'), replaying human actions for the
    first `n_human` frames. Yields {'score', 'frames'} per episode.

    Raises NotImplementedError for an unknown `player`.

    Fixes:
    - string comparison uses `==` instead of `is` (identity on str
      literals is implementation-dependent; SyntaxWarning on 3.8+);
    - `dtype=np.int` -> `dtype=int` (alias removed in NumPy 1.20+);
    - generator returns instead of raising StopIteration (PEP 479).
    """
    self.beta_net.eval()
    self.beta_target.eval()
    self.pi_net.eval()
    self.pi_target.eval()
    self.vb_net.eval()
    self.vb_target.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.qb_net.eval()
    self.qb_target.eval()
    env = Env(action_offset)
    n_human = 90  # frames of recorded human play before the policy acts
    episodes = list(self.data.keys())
    random.shuffle(episodes)
    humans_trajectories = iter(episodes)
    for i in range(n_tot):
        env.reset()
        trajectory = self.data[next(humans_trajectories)]
        choices = np.arange(self.global_action_space, dtype=int)
        # Exploration distribution restricted to the allowed-action mask.
        random_choices = self.mask_q.data.cpu().numpy()
        random_choices = random_choices / random_choices.sum()
        j = 0
        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            if player == 'beta':
                pi, _ = self.beta_net(s)
                pi = pi.squeeze(0)
                self.greedy = False
            elif player == 'q_b':
                pi, _ = self.qb_net(s)
                pi = pi.squeeze(0)
                self.greedy = True
            elif player == 'pi':
                pi, _ = self.pi_net(s)
                pi = pi.squeeze(0)
                self.greedy = False
            elif player == 'q_pi':
                pi, _ = self.q_net(s)
                pi = pi.squeeze(0)
                self.greedy = True
            else:
                raise NotImplementedError
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = np.random.rand()
                if self.greedy:
                    # 1% epsilon-greedy over the masked Q values.
                    if eps > 0.01:
                        a = np.argmax((pi * self.mask_q).data.cpu().numpy())
                    else:
                        a = np.random.choice(choices, p=random_choices)
                else:
                    a = F.softmax(pi + self.mask_beta, dim=0).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)
            env.step(a)
            j += 1
        yield {'score': env.score, 'frames': j}
def play_episode(self, n_tot):
    """Generator: roll out `n_tot` episodes, replaying recorded human
    actions for the first `n_human` frames and afterwards sampling from a
    masked, temperature-softmaxed behavioral distribution. Yields a
    diagnostics dict per environment step.

    Fixes:
    - `dtype=np.int` -> `dtype=int` (alias removed in NumPy 1.20+);
    - trailing `raise StopIteration` removed (PEP 479 turns it into a
      RuntimeError inside a generator);
    - the always-taken `if True:` imitation branch is inlined and the
      unreachable Q-based branch plus dead commented experiments removed.
    """
    self.model.eval()
    env = Env()
    n_human = 120  # frames of recorded human play before the policy acts
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()
    # Mask of legal actions for the current game.
    mask = torch.FloatTensor(consts.actions_mask[args.game])
    mask = Variable(mask.cuda(), requires_grad=False)
    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=int)
        j = 0
        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            v, q, beta, _, _, phi = self.model(s, self.actions_matrix)
            beta = beta.squeeze(0)
            q = q.squeeze(2)
            q = q.squeeze(0)
            q = q * mask
            temp = 0.1  # softmax temperature
            # Imitation: restrict to the hand-picked frequent actions and
            # normalize the behavioral logits by their maximum.
            maskb = Variable(torch.FloatTensor(
                [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                requires_grad=False)
            maskb = maskb.cuda()
            pi = maskb * (beta / beta.max())
            self.greedy = False
            beta_prob = pi
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                if self.greedy:
                    a = np.argmax(pi.data.cpu().numpy())
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)
            env.step(a)
            yield {
                'o': env.s.cpu().numpy(),
                'v': v.data.cpu().numpy(),
                's': phi.data.cpu().numpy(),
                'score': env.score,
                'beta': beta_prob.data.cpu().numpy(),
                'phi': phi.squeeze(0).data.cpu().numpy(),
            }
            j += 1
""" All information on README.md """ import tensorflow as tf from environment import Env import numpy as np import time import model steps = 1000 env = Env(vision=True) ob = env.reset(relaunch=True) print(ob) ###=================== Play the game with the trained model # while True: # env = Env(vision=True) # ob = env.reset(relaunch=True) # loss = 0.0 # for i in range(steps): # image = scipy.misc.imresize(ob, [66, 200]) / 255.0 # degrees = model.y.eval(feed_dict={model.x: [image], model.keep_prob: 1.0})[0][0] # ob, reward, done, _ = env.step(act) # if done is True: # break # else: # ob_list.append(ob) # # print("PLAY WITH THE TRAINED MODEL") # print(reward_sum)
self.optimizer([self.states, self.actions, discounted_rewards]) self.states, self.actions, self.rewards = [], [], [] if __name__ == "__main__": env = Env() agent = ReinforceAgent() global_step = 0 scores, episodes = [], [] for e in range(EPISODES): done = False score = 0 # fresh env state = env.reset() state = np.reshape(state, [1, 15]) while not done: global_step += 1 # get action for the current state and go one step in environment action = agent.get_action(state) next_state, reward, done = env.step(action) next_state = np.reshape(next_state, [1, 15]) agent.append_sample(state, action, reward) score += reward state = copy.deepcopy(next_state) if done: # update policy neural network for each episode
def main():
    """GAIL/VDB training entry point on a 20x20 grid environment.

    Loads an expert demonstration to place the expert start, builds
    actor/critic/VDB networks, then alternates rollout collection,
    discriminator (VDB) training, and actor-critic updates, saving
    checkpoints along the way. NOTE(review): indentation in this chunk was
    lost and has been reconstructed — verify nesting against upstream.
    """
    expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    print("demonstrations.shape", demonstrations.shape)
    print(expert_demo[1])
    print(expert_demo[0])
    print(np.array(expert_demo[0]).shape)
    # expert_x = int(expert_demo[1][0])
    # expert_y = int(expert_demo[1][1])
    expert_x = int(expert_demo[0][0])
    expert_y = int(expert_demo[0][1])
    env = Env(expert_x, expert_y)
    # env.seed(args.seed)
    # torch.manual_seed(args.seed)
    num_inputs = 6
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)
    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    # VDB discriminator takes (state, action) pairs.
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)

    # load demonstrations
    k = 1  # NOTE(review): apparently unused
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        # Resume networks and the running-state normalizer from checkpoint.
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])
        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']
        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True
    # NOTE(review): `iter` shadows the builtin throughout this loop.
    for iter in range(args.max_iter_num):
        # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1)%expert_sample_size), "rb"))
        print(iter)
        # Resample a random expert demonstration each iteration.
        expert_demo = pickle.load(open(
            './Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1, 50)), "rb"))
        tmp = expert_demo.pop(-1)
        demonstrations = np.array(expert_demo)
        print(demonstrations, demonstrations.shape)
        tot_sample_size = len(demonstrations) + 10
        ##########################
        actor.eval(), critic.eval()
        memory = deque()
        steps = 0
        scores = []
        # while steps < args.total_sample_size:
        while steps < tot_sample_size:
            # env.delete_graph()
            state = env.reset()
            # time.sleep(1)
            score = 0
            # state = running_state(state)
            state1 = state
            for _ in range((tot_sample_size + 1) * 2):
                if args.render:
                    env.render()
                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                # Discrete action for the env; continuous vector for the VDB.
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                irl_reward = get_reward(vdb, state, action)
                # ###### video-recording delay
                # if iter > 11500:
                #     time.sleep(0.015)
                # #####
                if done:
                    mask = 0
                else:
                    mask = 1
                memory.append([state, action, irl_reward, mask])
                # next_state = running_state(next_state)
                state = next_state
                score += reward
                if done:
                    break
            ##########################
            env.draw_graph()
            env.render()
            ##########################
            episodes += 1
            scores.append(score)
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)
        actor.train(), critic.train(), vdb.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            # Stop discriminator training once both accuracies are high enough.
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
        # NOTE(review): `if iter % 100:` is truthy for every iteration NOT
        # divisible by 100 — almost certainly meant `iter % 100 == 0`.
        if iter % 100:
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'vdb': vdb.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
    ####
    # Final "last_model" checkpoint after all iterations.
    score_avg = int(score_avg)
    model_path = os.path.join(os.getcwd(), 'save_model')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    ckpt_path = os.path.join(model_path, 'ckpt_' + 'last_model' + '.pth.tar')
    save_checkpoint({
        'actor': actor.state_dict(),
        'critic': critic.state_dict(),
        'vdb': vdb.state_dict(),
        'z_filter_n': running_state.rs.n,
        'z_filter_m': running_state.rs.mean,
        'z_filter_s': running_state.rs.sum_square,
        'args': args,
        'score': score_avg
    }, filename=ckpt_path)
big_cpus, big_disks, big_mems, big_time, big_lifes, big_profits, small_cpus, small_disks, small_mems, small_time, small_lifes, small_profits, federate=True) # Obtain the greedy solution print('### GREEDY SOLUTION without federation ###') env.reset() greedy_profit_no_fed = greedy(env, big_cpus, big_disks, big_mems, big_time, big_lifes, big_profits, small_cpus, small_disks, small_mems, small_time, small_lifes, small_profits, federate=False)
class DDPGStage:
    """DDPG runner for a ROS robot: trains or evaluates a policy mapping a
    366-D state to (linear, angular) velocity commands, choosing a Gazebo
    simulation or the real-world environment automatically."""

    def __init__(self, model, is_training=False, var=1.):
        self.max_step = 200  # per-episode step limit before declaring failure
        self.exploration_decay_start_step = 50000
        state_dim = 366
        action_dim = 2
        self.action_linear_max = 0.25  # m/s
        self.action_angular_max = 0.5  # rad/s
        rospy.init_node('ddpg_stage_1')
        rospy.on_shutdown(self.clear_vel)  # stop the robot on shutdown
        self.is_training = is_training
        # Use the simulated environment when Gazebo topics are published,
        # otherwise fall back to the real-world environment.
        if ['/gazebo/model_states', 'gazebo_msgs/ModelStates'] in rospy.get_published_topics():
            self.env = SimEnv(self.is_training)
            print("Gazebo mode")
        else:
            self.env = Env(self.is_training)
            print("Real world mode")
        self.agent = DDPG(model, self.env, state_dim, action_dim)
        self.past_action = np.array([0., 0.])
        print('State Dimensions: ' + str(state_dim))
        print('Action Dimensions: ' + str(action_dim))
        print('Action Max: ' + str(self.action_linear_max) + ' m/s and ' +
              str(self.action_angular_max) + ' rad/s')
        self.var = var  # exploration noise std-dev

    def _train(self):
        """Run training episodes forever: noisy actions, replay feeding,
        noise decay, and per-step console/plot reporting."""
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0
            while not rospy.is_shutdown():
                a = self.agent.action(state)
                # Gaussian exploration noise, clipped to valid command ranges.
                a[0] = np.clip(np.random.normal(a[0], self.var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], self.var), -0.5, 0.5)
                state_, r, collision, arrive = self.env.step(a)
                time_step = self.agent.perceive(state, a, r, state_, collision)
                if time_step > 0:
                    total_reward += r
                # Report the running average reward every 10000 agent steps.
                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0
                # Decay exploration noise after the warm-up, down to 0.1.
                if time_step % 5 == 0 and time_step > self.exploration_decay_start_step and self.var > 0.1:
                    self.var *= 0.9999
                state = state_
                one_round_step += 1
                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))
                result = 'Step: %3i | Reward: %.2f | Var: %.2f | Time step: %i |' % (
                    one_round_step, r, self.var, time_step)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    # New goal within the same episode.
                    self.env.common_reset()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def _evaluate(self):
        """Run greedy (noise-free) rollouts with a restricted goal range."""
        print('Testing mode')
        self.env.goal_range["x"] = [-1, 1]
        self.env.goal_range["y"] = [-1, 1]
        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0
            while not rospy.is_shutdown():
                a = self.agent.action(state)
                print("action: %s" % a)
                # Clip to valid command ranges (no exploration noise).
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, r, collision, arrive = self.env.step(a)
                state = state_
                one_round_step += 1
                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))
                result = 'Step: %3i | Reward: %.2f | Var: %.2f |' % (
                    one_round_step, r, self.var)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                    # input()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def run(self):
        """Dispatch to training or evaluation, then stop the robot."""
        # try:
        if self.is_training:
            self._train()
        else:
            self._evaluate()
        self.env.pub_cmd_vel.publish(Twist())

    def clear_vel(self):
        # Publish a zero Twist so the robot halts (shutdown hook).
        self.env.pub_cmd_vel.publish(Twist())
def play_episode(self, n_tot):
    """Generator: roll out `n_tot` episodes, replaying recorded human
    actions for the first `n_human` frames and sampling from the
    behavioral (beta) head afterwards. Yields a diagnostics dict per
    environment step.

    Fixes:
    - `dtype=np.int` -> `dtype=int` (`np.int` alias removed in NumPy 1.20+);
    - generator now ends with an implicit return instead of
      `raise StopIteration`, which PEP 479 turns into a RuntimeError;
    - removed the unused `mask` CUDA tensor and the dead commented-out
      MC-return loss experiment that followed the method.
    """
    self.beta_net.eval()
    self.pi_net.eval()
    self.vb_net.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.beta_target.eval()
    self.vb_target.eval()
    self.qb_net.eval()
    env = Env()
    n_human = 120  # frames of recorded human play before the policy acts
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()
    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=int)
        j = 0
        temp = 1  # softmax temperature
        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            beta, phi = self.beta_net(s)
            pi, _ = self.pi_net(s)
            q, _ = self.q_net(s)
            vb, _ = self.vb_net(s)
            # The behavioral head drives the action distribution.
            pi = beta.squeeze(0)
            self.greedy = False
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = 1
                if self.greedy and eps > 0.025:
                    a = np.argmax(pi.data.cpu().numpy())
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)
            # Q value of the chosen action, for the diagnostics below.
            q = q[0, a]
            q = q.squeeze(0)
            env.step(a)
            yield {
                'o': env.s.cpu().numpy(),
                'v': vb.squeeze(0).data.cpu().numpy(),
                'vb': vb.squeeze(0).data.cpu().numpy(),
                'qb': q.squeeze(0).data.cpu().numpy(),
                'score': env.score,
                'beta': pi.data.cpu().numpy(),
                'phi': phi.squeeze(0).data.cpu().numpy(),
                'q': q.squeeze(0).data.cpu().numpy(),
            }
            j += 1
class Training:
    """DQN training driver for the pioneer_dragging ROS package.

    Reads hyper-parameters from the ROS parameter server, runs the episode
    loop against `Env`, and live-plots rewards/epsilon (figure 1) and error
    distance (figure 2) with matplotlib while checkpointing models, replay
    memory, logs and figures to a per-user data folder.
    """

    def __init__(self):
        # Running per-episode histories used by plot_result().
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []
        self.logging_data = []

        # Parameters (fetched from the ROS parameter server)
        self.n_episodes = rospy.get_param("/n_episodes")
        self.n_step = rospy.get_param("/n_steps")
        self.mode_action = rospy.get_param('/mode_action')
        self.mem_size = rospy.get_param('/mem_size')
        self.batch_size = rospy.get_param('/batch_size')
        self.mode_optimize = rospy.get_param('/mode_optimize')
        self.avg_err_fre = rospy.get_param('/avg_err_fre')
        self.save_fre = rospy.get_param("/save_fre")
        self.load_checkpoint = rospy.get_param("/load_checkpoint")

        # create environment
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # create Deep Q-Network and its replay buffer
        self.dqn = DQN(self.n_states, self.n_actions)
        self.memory = ExperienceReplay(self.mem_size)

        # plot — a random matplotlib style is chosen per run and recorded
        # into the params yaml by init_file().
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'
        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # Figure 1 - Rewards (bar) with epsilon decay (line) on a twin y-axis
        self.fig1 = plt.figure(1)
        # fig = plt.figure(figsize=(12,5))
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()
        title_1 = 'Rewards - (Mode: Training)'
        self.ax1.set_title(title_1)
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # Figure 2 - Error distance per episode
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)
        title_2 = 'Error Distance - (Mode: Training)'
        self.ax3.set_title(title_2)
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')

        self.init_file()

    def moving_average(self, x, w):
        """Simple moving average of array `x` with window `w` ('valid' mode)."""
        return np.convolve(x, np.ones(w), 'valid') / w

    def init_file(self):
        """Create the per-run data folder and derive all output file paths.

        A new folder `<data>/<username>-<n>` is created per run; when resuming
        from a checkpoint the previous run's folder is reused (n_folder -= 1).
        """
        rospack = rospkg.RosPack()
        data_path = rospack.get_path("pioneer_dragging") + "/data"
        username = getpass.getuser()

        # n_folder = len(os.walk(data_path).__next__()[1])
        n_folder = glob("{}/{}*".format(data_path, username))
        n_folder = len(n_folder) + 1
        if self.load_checkpoint:
            n_folder -= 1  # resume: reuse the latest existing run folder

        self.data_path = "{}/{}-{}".format(data_path, username, n_folder)
        if not os.path.exists(self.data_path):
            os.mkdir(self.data_path)

        # config file — copy the launch params and append the chosen plot style
        if not self.load_checkpoint:
            config_path = rospack.get_path(
                "pioneer_dragging") + "/config/dragging_params.yaml"
            config_log = '{}/{}-params.yaml'.format(self.data_path, n_folder)
            os.system('cp {} {}'.format(config_path, config_log))
            plot_style = {'plot_style': self.style_plot}
            with open(config_log, 'r') as yamlfile:
                cur_yaml = yaml.safe_load(yamlfile)  # Note the safe_load
                cur_yaml.update(plot_style)
            if cur_yaml:
                with open(config_log, 'w') as yamlfile:
                    yaml.safe_dump(cur_yaml, yamlfile)  # Also note the safe_dump

        # history file
        self.history_log = '{}/{}-log.txt'.format(self.data_path, n_folder)
        # model file
        self.dqn.file_models = '{}/{}-pytorch-RL.tar'.format(
            self.data_path, n_folder)
        # memory file
        self.memory.file_mem = '{}/{}-memory.data'.format(
            self.data_path, n_folder)
        # figures file
        self.figure1 = '{}/{}-Rewards(Training).png'.format(
            self.data_path, n_folder)
        self.figure2 = '{}/{}-Error(Training).png'.format(
            self.data_path, n_folder)

    def plot_result(self, i_episode, cumulated_reward, epsilon, error_dist, loaded=False):
        """Update both live figures with the latest episode's results.

        When `loaded` is True the arguments are pandas Series restored from the
        history log (whole-history replot); otherwise they are scalars for the
        episode just finished and are appended to the running lists.
        """
        ### Figure 1
        # plot bar (cumulated reward)
        self.ax1.bar(i_episode, cumulated_reward, color=self.color1)

        # plot line (epsilon decay )
        if loaded:
            self.ax2.plot(i_episode, epsilon, color=self.color2)
            self.n_episode = i_episode.tolist()
            self.n_epsilon = epsilon.tolist()
            self.n_dist = error_dist.tolist()
        else:
            self.n_episode.append(i_episode)
            self.n_epsilon.append(epsilon)
            self.ax2.plot(self.n_episode, self.n_epsilon, color=self.color2)
            self.n_dist.append(error_dist)

        ### Figure 2
        # plot bar (error distance)
        self.ax3.bar(i_episode, error_dist, color=self.color3)

        # window_err = np.array(self.n_dist)
        # window_err = np.mean(window_err)
        # self.avg_err.append(window_err)
        # self.ax3.plot(self.n_episode, self.avg_err, color=self.color4)

        # plot line (average error distance) — only every `avg_err_fre` episodes
        if len(self.n_dist) % self.avg_err_fre == 0:
            avg_err = self.moving_average(np.array(self.n_dist), self.avg_err_fre)
            self.ax3.plot(avg_err, color=self.color4)

        plt.draw()
        plt.pause(0.1)

    def run(self):
        """Main training loop: optionally resume, then train for n_episodes.

        Each episode steps the environment until `done` (or ROS shutdown),
        optimizing the DQN according to `mode_optimize`, then plots, logs and
        periodically checkpoints model/memory/figures every `save_fre` episodes.
        """
        start_time = time.time()

        if self.load_checkpoint:
            self.memory.load()
            self.dqn.load_model()

            # history log loaded — replay the saved curves into the figures
            self.logging_data = [
                line.rstrip('\n') for line in open(self.history_log)
            ]
            hist_data = pd.read_csv(self.history_log, sep=",")
            i_episode = hist_data['i_episode']
            cumulated_reward = hist_data['cumulated_reward']
            epsilon = hist_data['epsilon']
            error_dist = hist_data['error_dist']
            self.plot_result(i_episode, cumulated_reward, epsilon,
                             error_dist, loaded=True)

            # resume after the last completed episode with the last epsilon
            i_episode = hist_data['i_episode'].iloc[-1] + 1
            self.dqn.epsilon = hist_data['epsilon'].iloc[-1]
            rospy.loginfo('[RL] Loaded checkpoint')
        else:
            i_episode = 0

        #########################################
        ###### Reinfrocement Training loop ######
        for i_episode in range(i_episode, self.n_episodes):
            state = self.env.reset(i_episode)
            cumulated_reward = 0
            steps = 0
            step_time = time.time()

            while not rospy.is_shutdown():
                steps += 1
                action, epsilon = self.dqn.select_action(state, i_episode)
                # print('num_steps: {}, epsilon: {}, steps_done: {}'.format(steps, epsilon, dqn.steps_done))
                # action = env.action_space.sample()
                rospy.loginfo('[RL] action: {}'.format(action))
                next_state, reward, done, info = self.env.step(action)
                self.memory.push(state, action, next_state, reward, done)
                cumulated_reward += reward

                ################################
                ######### optimize #############
                if self.mode_optimize == 'normal_dqn':
                    # without experience replay memory
                    self.dqn.optimize(state, action, next_state, reward, done)
                elif self.mode_optimize == 'dqn_replay_memory':
                    # with experience replay memory
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_replay_memory(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)
                elif self.mode_optimize == 'dqn_taget_net':
                    # with experience target net
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_DQN(state_mem, action_mem,
                                                   next_state_mem, reward_mem,
                                                   done_mem)
                elif self.mode_optimize == 'dueling_dqn':
                    # with double DQN
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_dueling_DQN(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                if not done:
                    state = next_state
                else:
                    break

            # DQN update param (e.g. epsilon/target net schedule per episode)
            self.dqn.update_param(i_episode)

            # Plotting
            error_dist = self.env.calc_dist()
            self.plot_result(i_episode, cumulated_reward, epsilon, error_dist)

            # Save Checkpoint
            temp_data = "{},{},{},{}".format(i_episode, cumulated_reward,
                                             epsilon, error_dist)
            self.logging_data.append(temp_data)

            if i_episode % self.save_fre == 0:
                rospy.loginfo('[RL] Save checkpoint: {}'.format(i_episode))
                self.dqn.save_model()  # save models
                self.memory.save()  # save replay memory

                # logging file
                with open(self.history_log, 'w') as f:
                    if not self.load_checkpoint:
                        f.write(
                            "i_episode,cumulated_reward,epsilon,error_dist\n")
                    for item in self.logging_data:
                        f.write("%s\n" % item)

                # save figures
                self.fig1.savefig(self.figure1, dpi=self.fig1.dpi)
                self.fig2.savefig(self.figure2, dpi=self.fig2.dpi)
                rospy.loginfo('[RL] Save figure1: {}'.format(self.figure1))
                rospy.loginfo('[RL] Save figure2: {}'.format(self.figure2))

            # Timing
            elapsed_time = time.time() - step_time
            total_time = time.time() - start_time
            print('\n********')
            print("Elapsed time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
            print("Total time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(total_time))))

        # Finish Training
        self.env.close()
        print()
        rospy.loginfo('[RL] Exit ...')

        total_time = time.time() - start_time
        print('\n*********************')
        print("Total time: ",
              time.strftime("%H:%M:%S", time.gmtime(total_time)))
        rospy.loginfo('[RL] Style plot: {}'.format(self.style_plot))
        plt.show(block=True)
N = 20 env = Env(dt=np.pi / N) RL = PolicyGradient( n_actions=env.n_actions, n_features=env.n_states, learning_rate=0.002, reward_decay=0.99, ) fid_10 = 0 ep_max = 500 for episode in range(ep_max): observation = env.reset() for ii in range(N): action = RL.choose_action(observation) observation_, reward, done, fid = env.step(action) RL.store_transition(observation, action, reward) observation = observation_ if done: if episode >= ep_max - 11: fid_10 = max(fid_10, fid) break RL.learn()
def play_episode(self, n_tot): self.model.eval() env = Env() n_human = 120 humans_trajectories = iter(self.data) softmax = torch.nn.Softmax() # mask = torch.FloatTensor(consts.actions_mask[args.game]) # mask = Variable(mask.cuda(), requires_grad=False) vsx = torch.FloatTensor(consts.short_bins[args.game]) vlx = torch.FloatTensor(consts.long_bins[args.game]) for i in range(n_tot): env.reset() observation = next(humans_trajectories) trajectory = self.data[observation] choices = np.arange(self.global_action_space, dtype=np.int) j = 0 while not env.t: s = Variable(env.s.cuda(), requires_grad=False) vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model( s, self.actions_matrix) beta = beta.squeeze(0) pi_l = pi_l.squeeze(0) pi_s = pi_s.squeeze(0) pi_l_tau = pi_l_tau.squeeze(0) pi_s_tau = pi_s_tau.squeeze(0) temp = 1 # consider only 3 most frequent actions beta_np = beta.data.cpu().numpy() indices = np.argsort(beta_np) maskb = Variable(torch.FloatTensor( [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), requires_grad=False).cuda() # maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), # requires_grad=False).cuda() # pi = maskb * (beta / beta.max()) pi = beta self.greedy = False beta_prob = pi if j < n_human: a = trajectory[j, self.meta['action']] else: eps = np.random.rand() # a = np.random.choice(choices) if self.greedy and eps > 0.1: a = pi.data.cpu().numpy() a = np.argmax(a) else: a = softmax(pi / temp).data.cpu().numpy() a = np.random.choice(choices, p=a) env.step(a) vs = softmax(vs) vl = softmax(vl) vs = torch.sum(vsx * vs.data.cpu()) vl = torch.sum(vlx * vl.data.cpu()) yield { 'o': env.s.cpu().numpy(), 'vs': np.array([vs]), 'vl': np.array([vl]), 's': phi.data.cpu().numpy(), 'score': env.score, 'beta': beta_prob.data.cpu().numpy(), 'phi': phi.squeeze(0).data.cpu().numpy(), 'qs': qs.squeeze(0).data.cpu().numpy(), 'ql': ql.squeeze(0).data.cpu().numpy(), } j += 1 raise StopIteration
decay_factor = 0.999 num_episodes = 10 r_avg_list = [] r_sum_list = [] file = open('diag.txt', 'w') for i in range(num_episodes): print("Episode {} of {}".format(i + 1, num_episodes)) eps *= decay_factor r_sum = 0 done = False diag_action = 0 diag_reward = 0 state = env.reset((i, num_episodes)) while not done: env.reset((i, num_episodes)) rand = np.random.random() if rand < eps: action = np.random.randint(0, 2) else: action = np.argmax(model.predict(np.identity(10)[state:state + 1])) new_s, r, done, _ = env.step(action=action, num=(i, num_episodes)) target = r + y * np.max(model.predict( np.identity(10)[new_s:new_s + 1])) target_vec = model.predict(np.identity(10)[state:state + 1])[0] target_vec[action] = target model.fit(np.identity(10)[state:state + 1], target_vec.reshape(-1, 2), epochs=1,
episode.append((next_state, action, reward, done)) if len(episode) > 200: # stop episode for time saving return [], False if done: break current_state = next_state return episode, True # main loop if __name__ == "__main__": env = Env() agent = MCAgent(actions=list(range(env.n_actions))) for episode in range(1000): print("Episode : ", episode + 1) current_state = env.reset() # generate episode episode, _ = generate_episode(env, agent) # update value table according to the episode agent.update(episode) # for monitoring values env.print_values(agent.value_table)
if __name__ == "__main__": env = Env() agent = ReinforceAgent() global_step = 0 scores, episodes = [], [] for e in range(EPISODES): done = False score = 0 #agent.model.load_model('qwerty_1.h5') # fresh env #tar = random.sample(range(0,25),1) #target = [int(tar[0]/5),tar[0]%5] img, g_map = env.reset() cv2.imshow('image', img) #state = np.reshape(state, [1, 3]) #img = state[0] #g_map = state[1] img = np.reshape(img, [1, img.shape[0], img.shape[1], img.shape[2]]) g_map = np.reshape(g_map, [1, 5, 5, 1]) #state = [img,g_map] while not done: global_step += 1 # get action for the current state and go one step in environment action = agent.get_action([img, g_map]) next_state, reward, done = env.step(action) img = next_state[0] cv2.imshow('image', img)
import random import math from gazebo_msgs.msg import * import numpy as np import csv import rospkg import matplotlib.pyplot as plt from matplotlib import cm import time from environment import Env if __name__ == "__main__": rospy.init_node("path_controller_node", anonymous=False) env = Env() state_scan = env.reset() action = np.zeros(2) pub = rospy.Publisher('/cmd_vel', Twist, queue_size=10) r = rospy.Rate(5) # 10hz velocity = Twist() while not rospy.is_shutdown(): # FACA SEU CODIGO AQUI if (min(state_scan[:20]) > 0.25): action[0] = .0 action[1] = 0. else: action[0] = 0. action[1] = 0.0 state_scan = env.step(action)
def main(): rospy.init_node('ddpg_stage_1') env = Env(is_training) agent = DDPG(env, state_dim, action_dim) past_action = np.array([0., 0.]) print('State Dimensions: ' + str(state_dim)) print('Action Dimensions: ' + str(action_dim)) print('Action Max: ' + str(action_linear_max) + ' m/s and ' + str(action_angular_max) + ' rad/s') if is_training: print('Training mode') avg_reward_his = [] total_reward = 0 var = 1. while True: state = env.reset() one_round_step = 0 while True: a = agent.action(state) a[0] = np.clip(np.random.normal(a[0], var), 0., 1.) a[1] = np.clip(np.random.normal(a[1], var), -0.5, 0.5) state_, r, done, arrive = env.step(a, past_action) time_step = agent.perceive(state, a, r, state_, done) if arrive: result = 'Success' else: result = 'Fail' if time_step > 0: total_reward += r if time_step % 10000 == 0 and time_step > 0: print( '---------------------------------------------------') avg_reward = total_reward / 10000 print('Average_reward = ', avg_reward) avg_reward_his.append(round(avg_reward, 2)) print('Average Reward:', avg_reward_his) total_reward = 0 if time_step % 5 == 0 and time_step > exploration_decay_start_step: var *= 0.9999 past_action = a state = state_ one_round_step += 1 if arrive: print('Step: %3i' % one_round_step, '| Var: %.2f' % var, '| Time step: %i' % time_step, '|', result) one_round_step = 0 if done or one_round_step >= 500: print('Step: %3i' % one_round_step, '| Var: %.2f' % var, '| Time step: %i' % time_step, '|', result) break else: print('Testing mode') while True: state = env.reset() one_round_step = 0 while True: a = agent.action(state) a[0] = np.clip(a[0], 0., 1.) a[1] = np.clip(a[1], -0.5, 0.5) state_, r, done, arrive = env.step(a, past_action) past_action = a state = state_ one_round_step += 1 if arrive: print('Step: %3i' % one_round_step, '| Arrive!!!') one_round_step = 0 if done: print('Step: %3i' % one_round_step, '| Collision!!!') break
else: print("ERROR IN TEST MODE!") # Main loop if __name__ == '__main__': running_reward = None reward_sum = 0 prev_x = None filename = './data/evaluation_logs.txt' for i_episode in range(default_config["max_iteration"]): attack_mode = random.randint(0, 6) state_new = env.reset(attack_mode) agent.update_current_channel(state_new) done = False for t in range(default_config["max_episode_length"]): # Get current channel x = np.zeros(default_config["max_channel"]) x[agent.cur_channel] = 1 # Put into the NN action_c = agent.c_policy.select_action(x).cpu().detach().numpy()[0] action_s = agent.s_policy.select_action(x).cpu().detach().numpy()[0] # print(int(action_c), " ", int(action_s)) state_new, reward, done, info = env.step(int(action_c), int(action_s)) agent.update_current_channel(state_new) reward_sum += reward
def main():
    """GAIL training loop: sample rollouts, train discriminator, then actor-critic.

    Loads a pickled expert demonstration, builds actor/critic/discriminator
    networks, optionally restores a checkpoint, and alternates discriminator
    and PPO-style actor-critic updates while logging to TensorBoard and
    periodically saving accuracy plots and checkpoints.
    """
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5
    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    # demo[1] holds the expert's (x, y) start used to parameterize the env
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)
    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)  # online state normalizer

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])
        # restore the normalizer's running statistics alongside the weights
        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']
        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    # NOTE(review): `iter` shadows the builtin throughout this loop.
    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()
        steps = 0
        scores = []

        # collect at least total_sample_size environment steps
        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()
                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                # env is stepped with the argmax action; the sampled action is
                # what gets stored/scored by the discriminator
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])
                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(
            iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            # temp_learner / temp_expert are presumably module-level lists —
            # TODO confirm; they are not defined in this function.
            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)
            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()
                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(
                    model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        # NOTE(review): `if iter % 100:` is truthy for every iter NOT divisible
        # by 100 — this checkpoints on 99 of every 100 iterations; the intent
        # was probably `iter % 100 == 0`. Left as-is (documentation pass only).
        if iter % 100:
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)

    # final accuracy curves over the whole run
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
def play_episode(self, n_tot): self.model.eval() self.model_b.eval() env = Env() n_human = 120 humans_trajectories = iter(self.data) softmax = torch.nn.Softmax() for i in range(n_tot): env.reset() observation = next(humans_trajectories) trajectory = self.data[observation] choices = np.arange(self.global_action_space, dtype=np.int) mask = Variable(torch.FloatTensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), requires_grad=False).cuda() j = 0 temp = 1 while not env.t: s = Variable(env.s.cuda(), requires_grad=False) beta, vb, qb, _, _ = self.model_b(s, self.actions_matrix) pi, v, q, adv, x = self.model(s, self.actions_matrix, beta.detach()) pi = pi.squeeze(0) self.greedy = False if j < n_human: a = trajectory[j, self.meta['action']] else: eps = np.random.rand() # a = np.random.choice(choices) if self.greedy and eps > 0.1: a = pi.data.cpu().numpy() a = np.argmax(a) else: a = softmax(pi/temp).data.cpu().numpy() a = np.random.choice(choices, p=a) q = q[0, a, 0] q = q.squeeze(0) qb = qb[0, a, 0] qb = qb.squeeze(0) env.step(a) yield {'o': env.s.cpu().numpy(), 'v': v.squeeze(0).data.cpu().numpy(), 'vb': vb.squeeze(0).data.cpu().numpy(), 'qb': qb.squeeze(0).data.cpu().numpy(), 's': x[0, :512].data.cpu().numpy(), 'score': env.score, 'beta': pi.data.cpu().numpy(), 'phi': x[0, :512].data.cpu().numpy(), 'q': q.squeeze(0).data.cpu().numpy()} j += 1 raise StopIteration
if __name__ == "__main__": # maze game # env = Maze() env = Env() agent = DQNAgent() global_step = 0 # agent.load_model("./save_model/10by10") scores, episodes = [], [] for e in range(EPISODES): done = False score = 0 state = env.reset() state = np.reshape(state, [1, 20]) while not done: # fresh env if agent.render: env.render() global_step += 1 # get action for the current state and go one step in environment action = agent.get_action(state) next_state, reward, done = env.step(action) next_state = np.reshape(next_state, [1, 20]) agent.replay_memory(state, action, reward, next_state, done) # every time step we do training
plt.ion() plt.figure(figsize=(100, 5)) # 设置画布大小 ax1 = plt.subplot(211) ax2 = plt.subplot(212) success = 0 totally = 0 zongzhou = [] while True: # main1.rl.restore_net() # main2.rl.restore_net() dic_state = env.reset(tools) for episodes in range(1000): dic_action = {} suss = 0 total = 0 for x in dic_state: if x not in dic_action: dic_action[x] = [] if x == 1: for num in range(len(dic_state[1])): # temp_state = tools.get_list(dic_state[1][num]) # 车组中所有车辆状态合成 # temp = main1.rl.real_choose_action(temp_state) # 学习到车组的动作组合 dic_action[1].append([int(env.cars_posit[dic_state[1][num][0][3]])])