class AgentTest(unittest.TestCase):
    def setUp(self):
        self.json_data = '{"observations": {"screen_features": ["height_map", "player_id", "player_relative", "unit_type"], ' \
                         '"minimap_features": ["player_id", "selected"], "nonspatial_features": ["player", "score_cumulative"], ' \
                         '"action_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}, "rewards": [1, 1, 1, 1]}'
        self.config = json.loads(self.json_data)
        self.sess = tf.Session()
        self.agent_modifier = AgentModifier(self.config, 32)
        self.agent = A2CAgent(self.sess, self.agent_modifier)
        self.env = Environment()
        self.obs = self.env.reset()

    def testMakeAction(self):
        print("Testing Make Action")
        action = self.agent.act(self.obs)
        action_made_1 = self.agent.convert_actions(action)
        action_2 = self.agent.act(self.obs)
        self.obs = self.env.reset()
        action_made_2 = self.agent.convert_actions(action_2)
        self.assertNotEqual(action_made_1, action_made_2)

    def testGetObservationFeed(self):
        print("Testing Get Observation Feed")
        feed_dict = self.agent._get_observation_feed(self.obs)
        self.obs = self.env.reset()
        feed_dict_2 = self.agent._get_observation_feed(self.obs)
        self.assertNotEqual(feed_dict, feed_dict_2)
class GameManager:
    def __init__(self, id):
        self.visualize = False
        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))
        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)
        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        input_laser, rotation = self.process_observation(observation)
        state_map = StateMap(input_laser)
        obs = np.array([[state_map.S_image], [rotation]])
        return obs

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])
        return obs, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()

    def process_observation(self, observation):
        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
        return laser_scan, orientation
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = Environment("test")
    actor_critic = ActorCritic(env, sess)

    num_trials = 10000
    trial_len = 500
    state_size = env.observation_size()

    for trial in range(num_trials):
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = actor_critic.act(cur_state)
            linear, angular = convert_action(action)
            new_state, reward, done, _ = env.step(linear, angular, 10)
            new_state = np.reshape(new_state, [1, state_size])
            actor_critic.remember(cur_state, action, reward, new_state, done)
            actor_critic.train()
            cur_state = new_state
            env.visualize()
            if done:
                break
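# `convert_action` is not defined in this snippet. As a minimal sketch of a plausible
# mapping from a discrete action index to a (linear, angular) velocity pair, the
# five-way table below mirrors Worker.convert_action further down this section;
# the helper itself and the fallback are assumptions, not the repo's actual code.
def convert_action(action):
    mapping = {
        0: (0.5, 1.0),    # sharp left
        1: (0.75, 0.5),   # soft left
        2: (1.0, 0.0),    # straight ahead
        3: (0.75, -0.5),  # soft right
        4: (0.5, -1.0),   # sharp right
    }
    return mapping.get(action, (0.5, -1.0))  # returns (linear, angular)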
class RewardModifierTest(unittest.TestCase):
    def setUp(self):
        json_data = '{"observations": {"screen_features": ["height_map", "player_id", "player_relative", "unit_type"], ' \
                    '"minimap_features": ["player_id", "selected"], "nonspatial_features": ["player", "score_cumulative"], ' \
                    '"action_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}, "rewards": [1, 1, 1, 1]}'
        config = json.loads(json_data)
        self._reward_mod = RewardModifier(config["rewards"])
        self.old_obs = [None]
        self.zero_obs = [None]
        self.env = Environment()
        self.obs = self.env.reset()

    def testModifyZero(self):
        print("Testing Zero")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.zero_obs[0])
        self.assertEqual(reward_1, 0)

    def testModifyReset(self):
        print("Testing Reset")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.old_obs = self.obs
        self.obs = self.env.reset()
        reward_2 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.assertEqual(reward_1, reward_2)

    def testModifySelectArmy(self):
        print("Testing Select Army")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.old_obs = self.obs
        new_obs = self.env.step([actions.FunctionCall(_SELECT_ARMY, [_NOT_QUEUED])])
        reward_2 = self._reward_mod.modify(new_obs[0], 0, self.old_obs[0])
        self.assertEqual(reward_1, reward_2)

    def testModifyAttack(self):
        print("Testing Attack")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.old_obs = self.obs
        new_obs = self.env.step([actions.FunctionCall(_SELECT_ARMY, [_NOT_QUEUED])])
        new_obs = self.env.step([actions.FunctionCall(_ATTACK_MINIMAP, [_QUEUED, [20, 20]])])
        reward_2 = self._reward_mod.modify(new_obs[0], 0, self.old_obs[0])
        self.assertEqual(reward_1, reward_2)
def learn_flappyb():
    env = Environment(draw=DRAW, fps=1, debug=False,
                      dist_to_pipe=DIFFICULTY_LEARN,
                      dist_between_pipes=DIST_BETWEEN_PIPES,
                      obs_this_pipe=OBS_THIS_PIPE_LEARN)
    writer = None
    if WRITE:
        writer = SummaryWriter(comment=NAME)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()
    model = load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)

    run = 0
    if SAVE_MODEL:
        name = '{}-PART={}'.format(NAME, run)
        dqn_solver.model.save('models/dqn/{}.h5'.format(name))

    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        reward_score = 0
        while True:
            step += 1
            action = dqn_solver.act(state, env)
            state_next, reward, terminal, info = env.step_buffer(action)
            reward_score += reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) +
                      ", exploration: " + str(dqn_solver.exploration_rate) +
                      ", score: " + str(reward_score))
                if WRITE:
                    writer.add_scalar("reward", reward_score, run)
                break
            dqn_solver.experience_replay()
        if (run % 100 == 0) and SAVE_MODEL:
            name = '{}-PART={}'.format(NAME, run)
            dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    if WRITE:
        writer.close()
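# DQNSolver itself is not shown here. For orientation, a minimal sketch of what its
# remember/act/experience_replay trio typically looks like for a Keras model. The
# hyperparameters, defaults, and method bodies below are assumptions, not the repo's
# actual implementation; env.get_action_random() is borrowed from play_flappyb().
import random
from collections import deque
import numpy as np

class DQNSolver:
    def __init__(self, observation_space, action_space, model,
                 gamma=0.95, memory_size=100000, batch_size=32,
                 exploration_max=1.0, exploration_min=0.01, exploration_decay=0.995):
        self.observation_space = observation_space
        self.action_space = action_space
        self.model = model
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay

    def remember(self, state, action, reward, next_state, done):
        # Store one transition for later replay.
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, env):
        # Epsilon-greedy: explore with a random action, otherwise take the greedy one.
        if np.random.rand() < self.exploration_rate:
            return env.get_action_random()
        return int(np.argmax(self.model.predict(state)[0]))

    def experience_replay(self):
        if len(self.memory) < self.batch_size:
            return
        for state, action, reward, next_state, done in random.sample(self.memory, self.batch_size):
            target = reward
            if not done:
                # Bootstrap the Q-target from the best next-state action value.
                target += self.gamma * np.amax(self.model.predict(next_state)[0])
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(self.exploration_min,
                                    self.exploration_rate * self.exploration_decay)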
class ObservationModifierTest(unittest.TestCase):
    def setUp(self):
        json_data = '{"observations": {"screen_features": ["height_map", "player_id", "player_relative", "unit_type"], ' \
                    '"minimap_features": ["player_id", "selected"], "nonspatial_features": ["player", "score_cumulative"], ' \
                    '"action_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}, "rewards": [1, 1, 1, 1]}'
        config = json.loads(json_data)
        self._obs_mod = ObservationModifier(config["observations"], 32)
        self.env = Environment()
        self.obs = self.env.reset()

    def testModifyScreen(self):
        print("Testing Screen Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("screen_features", alt_obs)

    def testModifyMinimap(self):
        print("Testing Minimap Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("minimap_features", alt_obs)

    def testModifyNonspatial(self):
        print("Testing Nonspatial Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("nonspatial_features", alt_obs)

    def testModifyAvailableMask(self):
        print("Testing Available Mask Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("available_mask", alt_obs)

    def testModifyAvailableActions(self):
        print("Testing Available Actions Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertNotIn("available_actions", alt_obs)
class GameManager:
    def __init__(self, id):
        self.visualize = False
        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))
        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)
        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
        return observation, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = Environment("test")
    actor_critic = ActorCritic(env, sess)

    num_trials = 10000
    trial_len = 500
    state_size = env.observation_size()

    for trial in range(num_trials):
        reward_sum = 0
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = actor_critic.act(cur_state)
            action2 = np.argmax(action[0])
            linear, angular = convert_action(action2)
            print("action", action)
            print("linear", linear)
            print("angular", angular)
            new_state, reward, done, _ = env.step(linear, angular, 20)
            new_state = np.reshape(new_state, [1, state_size])
            reward_sum += reward
            actor_critic.remember(cur_state, action, reward, new_state, done)
            cur_state = new_state
            env.visualize()
            if done:
                print("Break!")
                break
        actor_critic.train()  # train once per trial, after the episode ends
def main():
    env = Environment("test")
    state_size = env.observation_size()
    trials = 1000
    trial_len = 500

    dqn_agent = DQN(env=env)
    batch_size = 32

    for trial in range(trials):
        reward_sum = 0
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            linear, angular = convert_action(action)
            new_state, reward, done, _ = env.step(linear, angular, 10)
            new_state = np.reshape(new_state, [1, state_size])
            reward_sum += reward
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.target_train()  # iterates the target model
            cur_state = new_state
            env.visualize()
            if done:
                print("episode: {}/{}, score: {}, e: {:.2} time:{}".format(
                    trial, trials, reward_sum, dqn_agent.epsilon, step))
                break
        if len(dqn_agent.memory) > batch_size:
            dqn_agent.replay()  # internally iterates the default (prediction) model
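# DQN.target_train() is called every step above but not shown. A common Keras-style
# implementation blends online weights into the target network with a soft-update
# factor tau; the sketch below is an assumption about this repo's version, and the
# self.model / self.target_model / self.tau attributes are assumed to exist.
def target_train(self):
    # Soft update: target <- tau * online + (1 - tau) * target.
    weights = self.model.get_weights()
    target_weights = self.target_model.get_weights()
    for i in range(len(target_weights)):
        target_weights[i] = self.tau * weights[i] + (1 - self.tau) * target_weights[i]
    self.target_model.set_weights(target_weights)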
def play_flappyb():
    env = Environment(draw=True, fps=1, debug=True,
                      dist_to_pipe=DIFFICULTY_PLAY,
                      dist_between_pipes=DIST_BETWEEN_PIPES,
                      obs_this_pipe=OBS_THIS_PIPE_PLAY)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()
    model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)

    for i in range(20):
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        is_done = False
        while not is_done:
            action = dqn_solver.act_free(state)
            state_next, reward, terminal, info = env.step_buffer(action)
            is_done = terminal
            state = np.reshape(state_next, [1, observation_space])
def run(self):
    global episode
    env = Environment("test")
    while episode < EPISODES:
        state, _, _, _ = env.reset()
        state = np.reshape(state, [1, state_size])  # match the shape used for next_state below
        score = 0
        while True:
            action = self.get_action(state)
            linear, angular = self.convert_action(action)
            next_state, reward, done, _ = env.step(linear, angular, 10)
            next_state = np.reshape(next_state, [1, state_size])
            score += reward
            self.memory(state, action, reward)
            state = next_state
            env.visualize()
            if done:
                episode += 1
                print("episode: ", episode, "/ score : ", score)
                scores.append(score)
                self.train_episode(score != 100)
                break
def main():
    # The ROM could also be taken from argv; here it is hard-coded.
    filename = './Super_Mario_Land_World.gb'
    env = Environment(filename, max_steps=N_STEPS, visualize=VISUALIZE)
    env.start()

    agent = A2C_Agent(discount=0.99, epsilon=0.9, learning_rate=1e-3)
    agent_is_setup = False

    entropy_term = 0
    all_rewards = []
    all_lengths = []
    average_lengths = []

    for episode in range(N_EPOCHS):
        print("\n ", "=" * 50)
        print("Epoch {}/{}".format(episode + 1, N_EPOCHS))
        env.reset()
        state = env.obs()

        log_probs = []
        values = []
        rewards = []

        if not agent_is_setup:
            agent.setup(env.observation_space, env.action_space, use_model)
            agent_is_setup = True

        for steps in range(N_STEPS):
            # Get action from agent
            with torch.no_grad():
                action, log_prob, entropy, value = agent.get_action(state, TRAINING)
            value = value.detach().numpy()[0, 0]
            new_state, reward, done = env.step(action, steps)

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy

            # Set obs to the new state
            state = new_state

            if done or steps == N_STEPS - 1:
                Qval, _ = agent.model.forward(torch.Tensor(new_state))
                Qval = Qval.detach().numpy()[0, 0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:
                    sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(
                        episode, np.sum(rewards), steps, average_lengths[-1]))
                break

        print("Loss :", agent.train(values, rewards, log_probs, Qval, entropy_term))

    if SAVE_MODEL and TRAINING:
        date = datetime.datetime.now()
        model_name = str(date.day) + '_' + str(date.month) + '_' + str(date.hour) + '_' + agent.name + '.h5'
        agent.save_model(model_name)
    env.stop()
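# agent.train(values, rewards, log_probs, Qval, entropy_term) is the standard A2C
# update. A minimal PyTorch sketch of what it plausibly computes: discounted returns
# bootstrapped from Qval, advantages, and a combined actor/critic/entropy loss. The
# attribute names (self.discount, self.optimizer) and the 0.001 entropy weight are
# assumptions, not the repo's actual code.
def train(self, values, rewards, log_probs, Qval, entropy_term):
    # Discounted returns, walking the episode backwards from the bootstrap value.
    Qvals = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):
        Qval = rewards[t] + self.discount * Qval
        Qvals[t] = Qval

    values = torch.FloatTensor(values)
    Qvals = torch.FloatTensor(Qvals)
    log_probs = torch.stack(log_probs)

    # Advantage = empirical return minus the critic's estimate.
    advantage = Qvals - values
    actor_loss = (-log_probs * advantage.detach()).mean()
    critic_loss = 0.5 * advantage.pow(2).mean()
    loss = actor_loss + critic_loss + 0.001 * entropy_term

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()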
            0, target_policy, behaviour_policy, from_file=False, double=True)

actions = []
action = 0
actions.append(action)
environment = Env(population_size=1000, initial_sick=1, contagion_rate=1000, mortality_rate=0.1)
for i in range(10000):
    reward, discount, next_state = environment.update(action)
    action = agent.step(reward, discount, next_state)
    actions.append(action)
    if not environment.ALIVE:
        environment = environment.reset(population_size=1000, initial_sick=1,
                                        contagion_rate=1, mortality_rate=0.1)
agent.save_to_file()
# heatmap(agent.q_values)
# plot_history(array(environment.reward_history)*-1)
print(agent.q_values)
print(actions)
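# agent.step(reward, discount, next_state) together with agent.q_values suggests a
# tabular (double) Q-learning agent. A minimal sketch of such a step under those
# assumptions; the cached self.state / self.action, self.step_size, and the
# behaviour_policy callable mirror the constructor arguments above but are all
# hypothetical, not the repo's actual agent.
def step(self, reward, discount, next_state):
    # Q-learning update for the transition taken on the previous step.
    td_target = reward + discount * np.max(self.q_values[next_state])
    td_error = td_target - self.q_values[self.state, self.action]
    self.q_values[self.state, self.action] += self.step_size * td_error

    # Pick the next action with the behaviour policy (e.g. epsilon-greedy).
    self.state = next_state
    self.action = self.behaviour_policy(self.q_values[next_state])
    return self.action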
state_size = env.observation_size()
action_size = action_mapper.ACTION_SIZE
agent = DQNAgent(state_size, action_size)
# agent.load("./save/cartpole-dqn.h5")
batch_size = 48

print("START DQN")

for e in range(EPISODES):
    visualize = (e % 5 == 0)
    reward_sum = 0
    state, _, _, _ = env.reset()
    state = np.reshape(state, [1, state_size])
    for iteration in range(100):
        action = agent.act(state)
        linear, angular = action_mapper.map_action(action)
        next_state, reward, done, _ = env.step(linear, angular, 20)
        next_state = np.reshape(next_state, [1, state_size])
        reward_sum += reward
        agent.remember(state, action, reward, next_state, done)  # store the per-step reward
def main():
    # The ROM could also be taken from argv; here it is hard-coded.
    filename = './Super_Mario_Land_World.gb'
    env = Environment(filename, max_steps=N_STEPS, visualize=VISUALIZE)
    env.start()

    agent = DQN_Agent(discount=0.9, epsilon=0.9, learning_rate=1e-5)
    avg_loss = None
    agent_is_setup = False
    min_epsilon = 0.001
    max_epsilon = 0.001

    for episode in range(N_EPOCHS):
        print("\n ", "=" * 50)
        env.reset()
        state = torch.Tensor(env.obs())
        old_state = state
        old_old_state = state
        is_a_released = torch.ones(1)
        states = [
            torch.cat((state, old_state, old_old_state), 0).view(3, 16, 20),
            is_a_released,
            env.mario_size
        ]
        episode_reward = 0

        if not agent_is_setup:
            agent.setup(env.observation_space, env.action_space, use_model)
            agent_is_setup = True

        for steps in range(N_STEPS):
            # Get action from agent
            actions = agent.get_action(states, TRAINING)
            new_state, reward, done = env.step(actions)

            if actions[1] == 0:
                is_a_released = torch.zeros(1)
            else:
                is_a_released = torch.ones(1)

            if steps + 1 == N_STEPS:
                done = True
            episode_reward += reward

            new_states = [
                torch.cat((new_state, states[0][0, :, :], states[0][1, :, :]), 0).view(3, 16, 20),
                is_a_released,
                env.mario_size
            ]
            agent.update_replay_memory(states, actions, reward, new_states, done)

            # Train the neural network
            if TRAINING:
                loss = agent.train(done)
                if avg_loss is None:
                    avg_loss = loss
                else:
                    avg_loss = 0.99 * avg_loss + 0.01 * loss  # exponential moving average of the loss
            else:
                avg_loss = 0

            states = new_states

            if (steps + 1) % 20 == 0:
                print("\rAverage loss : {:.5f} --".format(avg_loss),
                      "Episode rewards: {} --".format(episode_reward),
                      "epochs {}/{} --".format(episode, N_EPOCHS),
                      "steps {}/{}".format(steps + 1, N_STEPS),
                      end="")
            if done:
                print("\n", env.level_progress_max)
                break

        agent.epsilon = max(min_epsilon,
                            min(max_epsilon, 1.0 - math.log10((episode + 1) / 5)))

    if SAVE_MODEL and TRAINING:
        date = datetime.datetime.now()
        model_name = str(date.day) + '_' + str(date.month) + '_' + str(date.hour) + '_' + agent.name + '.h5'
        agent.save_model(model_name)
    env.stop()
from environment.action import Action
from environment.environment import Environment
from utils.constants import NUM_EPISODES

env = Environment()
for episode in range(NUM_EPISODES):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action = Action(0, env.action_space.sample() - 1, env.action_space.sample() - 1)
        n_state, reward, done, info = env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score}")
def run_ddpg(self):
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    np.random.seed(0)

    # define model
    agent = DDPGAgent(self.config.agent.ddpg)
    agent.cuda()
    env = Environment(self.config.env)
    env.reset()
    if self.config.env.model_load_path is not None:
        agent.load_state_dict(torch.load(self.config.env.model_load_path))

    # define data
    if self.config.env.dataset == 'cifar100':
        dataset = Cifar100(self.config.env)
    else:
        dataset = Cifar10(self.config.env)
    train_actor_val_loader = DataLoader(
        dataset.train_actor_val_dataset,
        batch_size=self.config.env.val_batch_size,
        shuffle=False,
        num_workers=self.config.env.workers)
    test_actor_test_loader = DataLoader(
        dataset.test_actor_test_dataset,
        batch_size=self.config.env.val_batch_size,
        shuffle=False,
        num_workers=self.config.env.workers)

    # define log output
    time_array = time.localtime(time.time())
    log_time = time.strftime("%Y_%m_%d_%H_%M_%S", time_array)

    # name log_dir with the run's parameters
    use_loss_norm = 'use_loss_norm' if self.config.env.features.use_loss_norm else 'no_loss_norm'
    use_logits = 'use_logits' if self.config.env.features.use_logits else 'no_logits'
    use_loss_abs = 'use_loss_abs' if self.config.env.features.use_loss_abs else 'no_loss_abs'
    use_loss_gain = 'use_loss_gain' if self.config.env.learn_lr_gain else 'no_loss_gain'
    log_dir_name = '_'.join([self.config.env.reward.reward_option, use_loss_norm, use_logits,
                             use_loss_abs, str(self.config.env.reward.filter_loss_rate),
                             str(self.config.agent.ddpg.buffer_size),
                             str(self.config.agent.ddpg.a_learning_rate),
                             str(self.config.agent.ddpg.c_learning_rate),
                             self.config.agent.ddpg.weight_option, use_loss_gain, log_time])
    log_dir = os.path.join(self.config.log_root, self.config.env.log_dir, log_dir_name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    save_dir = os.path.join(self.config.log_root, self.config.env.save_dir, log_dir_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # dump the parameters used for this run
    paras_path = os.path.join(log_dir, 'paras.yaml')
    with open(paras_path, "w", encoding='utf-8') as f:
        yaml.dump(self.config, f)

    b_out = None
    for current_episode in range(self.config.env.num_episode):
        episode_log = 'sampler_episode_' + str(current_episode)
        if (current_episode or self.config.env.model_load_path is not None) and current_episode % \
                self.config.agent.ddpg.test_actor_step == 0:
            episode_log = 'test_sampler_episode_' + str(current_episode)
        log_path = os.path.join(log_dir, episode_log)
        tb_logger = SummaryWriter(log_path)

        agent.reset_noise()
        env.reset()
        episode_step = 0
        pre_buffer = [None, None, None]

        if current_episode and current_episode % self.config.env.save_interval == 0:
            save_path = os.path.join(save_dir, 'episode_' + str(current_episode) + '.pth')
            agent.save_model(save_path)

        if (current_episode or self.config.env.model_load_path is not None) and current_episode % \
                self.config.agent.ddpg.test_actor_step == 0:
            sampler = EpisodeGivenSampler(dataset.test_actor_train_dataset,
                                          self.config.env.num_stages,
                                          self.config.env.num_stage_step_test,
                                          self.config.env.num_candidates,
                                          total_iters=self.config.env.total_iters)
            test_actor_train_loader = DataLoader(
                dataset.test_actor_train_dataset,
                batch_size=self.config.env.num_candidates,
                shuffle=False,
                num_workers=self.config.env.workers,
                drop_last=True,
                sampler=sampler)
            episode_done_step = len(test_actor_train_loader) - 1
            for i, (input_image, target) in enumerate(test_actor_train_loader):
                env.adjust_learning_rate_by_stage(episode_step,
                                                  self.config.env.num_stage_step_test,
                                                  env.optimizer,
                                                  self.config.env.lr_stages)
                if self.config.env.reward.reward_option == 'sub_reference_model':
                    env.adjust_learning_rate_by_stage(episode_step,
                                                      self.config.env.num_stage_step_test,
                                                      env.reference_optimizer,
                                                      self.config.env.lr_stages)
                input_image = input_image.cuda()
                target = target.cuda()
                current_epoch = episode_step // self.config.env.num_step_per_epoch_test
                if episode_step < self.config.env.num_warmup_step_test:
                    self.warm_up(env, input_image, target, episode_step, current_epoch,
                                 test_actor_test_loader, tb_logger, b_out,
                                 self.config.env.num_stage_step_test)
                else:
                    done = (episode_step == episode_done_step)
                    self.test_actor(agent, env, episode_step, input_image, target,
                                    test_actor_test_loader, tb_logger, pre_buffer,
                                    current_epoch, done)
                episode_step += 1
        else:
            sampler = EpisodeGivenSampler(dataset.train_actor_train_dataset,
                                          self.config.env.num_stages,
                                          self.config.env.num_stage_step,
                                          self.config.env.num_candidates,
                                          total_iters=self.config.env.total_iters)
            train_actor_train_loader = DataLoader(
                dataset.train_actor_train_dataset,
                batch_size=self.config.env.num_candidates,
                shuffle=False,
                num_workers=self.config.env.workers,
                drop_last=True,
                sampler=sampler)
            episode_done_step = len(train_actor_train_loader) - 1
            for i, (input_image, target) in enumerate(train_actor_train_loader):
                env.adjust_learning_rate_by_stage(episode_step,
                                                  self.config.env.num_stage_step,
                                                  env.optimizer,
                                                  self.config.env.lr_stages)
                if self.config.env.reward.reward_option == 'sub_reference_model':
                    env.adjust_learning_rate_by_stage(episode_step,
                                                      self.config.env.num_stage_step,
                                                      env.reference_optimizer,
                                                      self.config.env.lr_stages)
                input_image = input_image.cuda()
                target = target.cuda()
                current_epoch = episode_step // self.config.env.num_step_per_epoch
                if episode_step < self.config.env.num_warmup_step:
                    self.warm_up(env, input_image, target, episode_step, current_epoch,
                                 train_actor_val_loader, tb_logger, b_out,
                                 self.config.env.num_stage_step)
                else:
                    done = episode_step == episode_done_step - 1
                    self.train_actor(agent, env, episode_step, input_image, target,
                                     train_actor_val_loader, tb_logger, pre_buffer,
                                     current_epoch, done)
                episode_step += 1
    if b_out is not None:
        b_out.close()
class WorkerAgent(threading.Thread):
    def __init__(self, name, graph_ops, update_ops, world_name, use_target, session, saver):
        super().__init__()
        self.name = name
        self.graph_ops = graph_ops
        self.update_ops = update_ops
        self.session = session
        self.saver = saver
        self.env = Environment(world_name)
        self.env.use_observation_rotation_size(use_target)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.state_size = self.env.observation_size()
        self.action_size = action_mapper.ACTION_SIZE

    def run(self):
        global global_episode, global_step
        print('Thread {} started.'.format(self.name))
        local_episodes = 0
        accumulated_reward = 0
        best_reward = 0
        epsilon = INITIAL_EPSILON
        state_batch = []
        reward_batch = []
        action_batch = []
        period_start_time = time.time()

        while global_episode <= MAX_EPISODES:
            self.env.reset()
            state, _, _, _ = self.env.step(0, 0)
            state = self.reshape_state(state)
            episode_step = 0
            episode_reward = 0
            while True:
                q_output = self.graph_ops['network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['network']['input']: [state]})

                # Epsilon-greedy action selection.
                if random() <= epsilon:
                    action_index = randrange(self.action_size)
                else:
                    action_index = np.argmax(q_output)
                a_t = np.zeros([self.action_size])
                a_t[action_index] = 1

                if epsilon > final_epsilon:
                    epsilon -= (INITIAL_EPSILON - final_epsilon) / anneal_epsilon_timesteps

                x1, x2 = action_mapper.map_action(action_index)
                next_state, reward, term, info = self.env.step(x1, x2, 10)
                next_state = self.reshape_state(next_state)
                episode_reward += reward

                if visualize:
                    self.env.visualize()

                next_q_values = self.graph_ops['target_network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['target_network']['input']: [next_state]})

                if not term:
                    reward = reward + gamma * np.amax(next_q_values)

                state_batch.append(state)
                action_batch.append(a_t)
                reward_batch.append(reward)

                if global_step % target_update_timestep == 0:
                    self.session.run(self.update_ops['reset_target_network'])
                    print("Target network reset")

                if episode_step % UPDATE_PERIOD == 0 or term:
                    self.session.run(self.update_ops['minimize'],
                                     feed_dict={
                                         self.update_ops['y']: reward_batch,
                                         self.update_ops['a']: action_batch,
                                         self.graph_ops['network']['input']: state_batch
                                     })
                    state_batch = []
                    action_batch = []
                    reward_batch = []

                if global_step % CHECKPOINT_PERIOD_TIMESTEPS == 0:
                    self.saver.save(self.session, CHECKPOINT_PATH, global_step=global_step)

                global_step += 1
                state = next_state
                episode_step += 1
                if term:
                    break

            accumulated_reward += episode_reward
            best_reward = episode_reward if (episode_reward > best_reward) else best_reward
            local_episodes += 1
            global_episode += 1
            if local_episodes % PRINT_EVERY == 0:
                period_end_time = time.time()
                print("Thread {0:}. Total Episodes {1:}. Reward AVG: {2:.3f}, Best Reward: {3:.3f}, "
                      "Globalstep: {4:6d}, Epsilon: {5:f}, Time: {6:}"
                      .format(self.name, global_episode, accumulated_reward / PRINT_EVERY,
                              best_reward, global_step, epsilon,
                              period_end_time - period_start_time))
                accumulated_reward = 0
                best_reward = -99999
                period_start_time = time.time()

    def reshape_state(self, state):
        return np.reshape(state, [self.state_size, 1])
):  # Checking if there are previous training performances saved
    os.remove(performance_file_path)  # Deleting the old train performances
if os.path.exists(log):  # Checking if there are previous training performances saved
    os.remove(log)  # Deleting the old train performances

print(dt.now())
print("stop loss:", stop_loss_value)
print("pc: BH")

# ***************************** Looping over all Episodes *****************************
for ep in range(n_episodes - n_prev_iterations):
    time_start = dt.now()
    total_revenue = 0  # Counts the total reward for a single episode
    print("Iteration: " + str(ep + 1) + "/" + str(n_episodes - n_prev_iterations))
    env.reset()  # Resetting the environment
    agent.reset()  # Resetting the agent mini-batch memory
    state, reward = env.step("Hold")  # Making a first neutral action to get the first state

    # ***************************** Looping over all Instances *****************************
    while not env.done:  # Loop until we finish all the instances
        action = agent.act(state)  # The agent chooses an action based on the current state
        next_state, reward = env.step(actions[action])  # Getting the next state and reward based on the chosen action
        '''with open(log, "a+") as file:
            file.write(str(actions[action]) + "\n")  # Saving the performance on a file
            if env.stop_loss_triggered:
                file.write("Stop Loss Triggered!" + "\n")  # Saving the stop loss taken on a file
        return action

    def process_state_batch(self, batch):
        return batch[:, 0, :]

env = Environment("Simulation2d/svg/proto_1", 6)
env.use_observation_rotation_size(True)
env.set_observation_rotation_size(128)
env.set_mode(Mode.ALL_RANDOM)

processor = ManualProc()
states = env.observation_size()
actions = action_mapper.ACTION_SIZE

if DEBUG:
    print('states: {0}'.format(states))
    print('actions: {0}'.format(actions))

state, reward, done, _ = env.reset()
env.render()

done = False
while not done:
    value = input("Next action [0 - 6]: \n")
    action = int(value)
    state, reward, done, _ = env.step(action)
    env.render()
class Worker(object):
    def __init__(self, name, globalAC):
        if MULTIPLE_ROOMS:
            if name == "W_0" or name == "W_1" or name == "W_2":
                self.env = Environment(ENV_NAME)
            elif name == "W_3" or name == "W_4" or name == "W_5":
                self.env = Environment(ENV_NAME_2)
            else:
                self.env = Environment(ENV_NAME_3)
        else:
            self.env = Environment(ENV_NAME)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.env.set_observation_rotation_size(64)  # TODO
        self.env.use_observation_rotation_size(True)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5
        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                a = self.AC.choose_action(s)  # get the action
                b = np.asarray(a)
                b = b[0][0]
                action = np.argmax(b)

                linear, angular = self.convert_action(action)
                s_, r, done, _ = self.env.step(linear, angular, SKIP_LRF)
                s_ = np.reshape(s_, [1, N_S])

                if (self.name == 'W_0') and VISUALIZE:
                    self.env.visualize()

                done = True if ep_t == MAX_EP_STEP - 1 else done
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(b)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and pull the result into the local net
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # walk the reward buffer backwards
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), \
                        np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    if self.name == "W_0":
                        print(self.name, "Ep:", GLOBAL_EP, "Ep_r:", ep_r)
                    GLOBAL_EP += 1
                    if GLOBAL_EP % SAVE_INTERVAL == 0:
                        print("Trying to save...")
                        self.AC.save_global()
                        print("...saved!")
                    break
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = Environment(ENV_NAME)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5
        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0
            rnn_state = SESS.run(self.AC.init_state)  # zero rnn state at beginning
            keep_state = rnn_state.copy()  # keep rnn state for updating global net
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.visualize()
                a, rnn_state_ = self.AC.choose_action(s, rnn_state)  # get the action and next rnn state
                action = np.argmax(a)
                linear, angular = self.convert_action(action)
                s_, r, done, _ = self.env.step(linear, angular, 10)  # the number is how many laser scans to skip
                s_ = np.reshape(s_, [1, N_S])
                done = True if ep_t == MAX_EP_STEP - 1 else done

                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and pull the result into the local net
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {
                            self.AC.s: s_,
                            self.AC.init_state: rnn_state_
                        })[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # walk the reward buffer backwards
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), \
                        np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        self.AC.init_state: keep_state,
                    }
                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    keep_state = rnn_state_.copy()  # the new initial rnn state for the next update

                s = s_
                rnn_state = rnn_state_  # renew rnn state
                total_step += 1

                if self.name == 'W_0':
                    self.env.visualize()

                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(self.name, "Ep:", GLOBAL_EP, "| Ep_r: %i" % GLOBAL_RUNNING_R[-1])
                    GLOBAL_EP += 1
                    break
def main(test=False, chkpt=None, device='cuda'):
    """
    main is used to start and perform the training in non-render mode
    :param test: Not required
    :param chkpt: Not required
    :param device: string (cuda or cpu)
    :return: None
    """
    if not test:
        wandb.init(project="MultiSection Continum", name="Reaching Task 32 Per Layer")

    robot = Robot()
    robot.newSection()
    robot.newSection()
    env = Environment(robot)
    if test:
        env.render()
    lastObs = env.getObservation()

    rb = ReplayBuffer()
    memorySize = 500000
    minRBSize = 20000
    sampleSize = 750
    envStepsBeforeTrain = 250
    targetModelUpdate = 500
    epsMin = 0.01
    epsDecay = 0.99999

    model = Model(len(lastObs.state), len(env.robot.actions)).to(device)
    if chkpt is not None:
        model.load_state_dict(torch.load(chkpt))
    targetModel = Model(len(lastObs.state), len(env.robot.actions)).to(device)
    updateTGTModel(model, targetModel)  # copy the initial weights into the target network

    stepSinceTrain = 0  # steps since the last main-network training; the main network updates every envStepsBeforeTrain steps
    stepSinceTGTUpdate = 0  # steps since the last target-network update (i.e. transferring main-network weights); updates every targetModelUpdate steps
    stepNum = -1 * minRBSize
    episodeRewards = []
    rollingReward = 0

    tq = tqdm()
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)

        eps = epsDecay ** (stepNum / 10)
        if test:
            eps = 0

        if random() < eps:
            action = env.robot.randomAction()
        else:
            actNum = model(torch.tensor(lastObs.state).to(device)).max(-1)[-1].item()
            action = env.robot.actions[actNum]

        obs = env.robotStep(action[0], action[1])
        rollingReward = obs.reward
        lastObs = obs  # act on the newest observation next step

        if env.done():
            env.reset()

        stepSinceTrain += 1
        stepNum += 1
        rb.insert(obs)

        if (not test) and rb.index >= minRBSize and stepSinceTrain > envStepsBeforeTrain:
            stepSinceTGTUpdate += 1
            loss = trainStep(rb.sample(sampleSize), model, targetModel,
                             len(env.robot.actions), device)
            wandb.log({
                "Loss": loss.detach().cpu().item(),
                "eps": eps,
                "Step Rewards:": np.mean(episodeRewards)
            }, step=stepNum)
            stepSinceTrain = 0

            if stepSinceTGTUpdate > targetModelUpdate:
                print("Updating Target Model")
                updateTGTModel(model, targetModel)
                stepSinceTGTUpdate = 0
                torch.save(
                    targetModel.state_dict(),
                    f"/u/meharabd/research/CRLMachineLearningProject/Models/{stepNum}.pth")
                episodeRewards = []
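# updateTGTModel and trainStep are referenced above but not shown. A minimal sketch
# of plausible implementations: the field names on the sampled observations (state,
# actionIndex, reward, nextState, done), the gamma default, and the optimizer being
# attached to the model are all assumptions, not the repo's actual code.
import torch
import torch.nn.functional as F

def updateTGTModel(model, targetModel):
    # Hard update: copy the online network's weights into the target network.
    targetModel.load_state_dict(model.state_dict())

def trainStep(batch, model, targetModel, numActions, device, gamma=0.99):
    states = torch.tensor([o.state for o in batch], dtype=torch.float32, device=device)
    actions = torch.tensor([o.actionIndex for o in batch], dtype=torch.int64, device=device)
    rewards = torch.tensor([o.reward for o in batch], dtype=torch.float32, device=device)
    nextStates = torch.tensor([o.nextState for o in batch], dtype=torch.float32, device=device)
    dones = torch.tensor([o.done for o in batch], dtype=torch.float32, device=device)

    with torch.no_grad():
        # Bootstrap from the target network; zero out terminal states.
        nextQ = targetModel(nextStates).max(-1)[0]
        targets = rewards + gamma * (1 - dones) * nextQ

    # Q-values of the actions actually taken.
    qValues = model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
    loss = F.smooth_l1_loss(qValues, targets)

    model.optimizer.zero_grad()  # assumes the optimizer is attached to the model
    loss.backward()
    model.optimizer.step()
    return loss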