def __init__(self, env_name, global_actor_critic, lock, eps=0.5,
             anneal_rate=0.99, t_max=10, gamma=0.99, lr=0.0005):
    Thread.__init__(self)
    self.env = gym.make(env_name)
    self.env_name = env_name
    self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.n
    self.lock = lock
    self.eps = eps
    self.anneal_rate = anneal_rate
    self.global_actor_critic = global_actor_critic
    self.actor_critic = ActorCritic(self.state_dim, self.action_dim)
    self.t_max = t_max
    self.gamma = gamma
    self.optimizer = tf.keras.optimizers.Adam(lr)
    log_dir = 'logs'
    self.summary_writer = tf.summary.create_file_writer(log_dir)
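# A minimal sketch (not from the source above) of the sync step such an A3C
# worker thread typically performs before each rollout; the method name
# `sync_with_global` is an assumption, while `lock`, `actor_critic`, and
# `global_actor_critic` come from the __init__ above.
def sync_with_global(self):
    with self.lock:
        # Copy the shared global network's weights into the thread-local copy
        # so the next rollout is collected with the latest parameters.
        self.actor_critic.set_weights(self.global_actor_critic.get_weights())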
def __init__(self):
    self.run_epochs = 0
    self.epochs_total = 0
    self.hybrid_loss_cumulative = []
    self.critic_loss_cumulative = []
    self.critic_target_loss_cumulative = []
    self.actor_loss_cumulative = []
    self.scores_cumulative = []
    self.critic_scores_cumulative = []
    self.actor_scores_cumulative = []
    self.winratio_cumulative = []
    self.epsilon_cumulative = []
    self.epsilon = 0.9
    self.last_lr_change = 0
    e = Map(self.grid_size[0], self.grid_size[1])
    e.USE_MAZE = self.use_maze
    e.curriculum = self.curriculum  # maximum distance from the goal at which the player spawns
    self.environment = e
    self.action_count = e.action_space.n
    self.action_shape = (self.action_count,)
    self.buffer = ReplayBuffer(self.buffer_size)
    num_rewards = len(e.hybrid_rewards())
    self.actor_critic = ActorCritic(self.input_shape, self.action_shape, num_rewards)
    self.actor_critic_target = ActorCritic(self.input_shape, self.action_shape, num_rewards)
    self.possible_actions = np.eye(e.action_space.n)[np.arange(e.action_space.n)]
def __init__(self, env, GAMMA=0.5):
    self.env = env
    self.states_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]
    self.actor_critic = ActorCritic(self.states_dim, self.action_dim, lr=1e-10)
    self.all_observations = np.asarray([])
def __init__(self, env, GOAL_STATE, GAMMA=0.95, lr=0.001):
    self.env = env
    self.GOAL_STATE = GOAL_STATE
    self.states_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]
    self.actor_critic = ActorCritic(self.states_dim, self.action_dim,
                                    GAMMA=GAMMA, lr=lr)
    self.min_spread_holder = MinSpreadHolder(self.states_dim)
def learn(logger, episodes, render):
    env = Env()
    actor = ActorCritic(env, DISCOUNT)
    lr_policy = LearningRate(MAX_LEARNING_RATE_POLICY, MIN_LEARNING_RATE_POLICY, episodes)
    lr_value = LearningRate(MAX_LEARNING_RATE_VALUE, MIN_LEARNING_RATE_VALUE, episodes)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        scores = []
        for episode in range(episodes):
            lrp = lr_policy.get_lr(episode)
            lrv = lr_value.get_lr(episode)
            score = learn_episode(env, actor, lrp, lrv, render, sess)
            scores.append(score)
            mean = np.mean(scores[-STEPS_TO_WIN:])
            args = [episode, episodes, score, mean, lrp, lrv]
            logger.info('After {}/{} episodes: {}, mean: {:.2f}, lrp: {:.6f}, lrv: {:.6f}'.format(*args))
        save_path = saver.save(sess, os.path.join(os.getcwd(), 'model.ckpt'))
        logger.info('Model saved to {}'.format(save_path))
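# `LearningRate` is used but not defined above; a plausible minimal
# implementation (an assumption, not the source's code) anneals linearly
# from the maximum to the minimum rate over the run:
class LearningRate:
    def __init__(self, max_lr, min_lr, episodes):
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.episodes = episodes

    def get_lr(self, episode):
        # Linear interpolation, clamped once the final episode is reached.
        frac = min(episode / float(self.episodes), 1.0)
        return self.max_lr + frac * (self.min_lr - self.max_lr)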
def __init__(self, state_dim, action_bound=1.0, final_activation=tf.identity,
             training_batch_size=32, GAMMA=0.95, lr=0.001, replay_buffer_size=1024):
    self.AC = ActorCritic(state_dim, state_dim,
                          final_activation=final_activation,
                          action_bound=action_bound,
                          training_batch_size=training_batch_size,
                          GAMMA=GAMMA, lr=lr,
                          replay_buffer_size=replay_buffer_size)
class GoalController(object):
    def __init__(self, state_dim, action_bound=1.0, final_activation=tf.identity,
                 training_batch_size=32, GAMMA=0.95, lr=0.001, replay_buffer_size=1024):
        # The "action" of this controller is itself a state, so the actor
        # maps state_dim inputs to state_dim outputs.
        self.AC = ActorCritic(state_dim, state_dim,
                              final_activation=final_activation,
                              action_bound=action_bound,
                              training_batch_size=training_batch_size,
                              GAMMA=GAMMA, lr=lr,
                              replay_buffer_size=replay_buffer_size)

    def add_to_replay_buffer(self, state, goal_state, reward, resulting_state):
        # Here, reward means exactly what it sounds like it does...
        self.AC.add_to_replay_buffer(state, goal_state, reward, resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, rewards, resulting_states):
        for s, gs, r, rs in zip(states, goal_states, rewards, resulting_states):
            self.AC.add_to_replay_buffer(s, gs, r, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_goal_state(self, current_states):
        return self.AC.get_actions(current_states)
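# A hedged usage sketch for GoalController; the dimensions, the toy reward,
# and the random data below are illustrative assumptions, not from the source.
import numpy as np

state_dim = 8
controller = GoalController(state_dim)
states = np.random.randn(32, state_dim)
goals = controller.get_goal_state(states)          # propose a goal per state
rewards = -np.sum((goals - states) ** 2, axis=1)   # toy reward: nearby goals
next_states = states + 0.01 * np.random.randn(32, state_dim)
controller.add_batch_to_replay_buffer(states, goals, rewards, next_states)
controller.train_from_replay_buffer()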
def pd_test(env_fn, policy, load_path):
    env = env_fn()
    actions = env.unwrapped.action_list
    env._seed(int(time.time()))
    obs = env.reset()
    obs = np.expand_dims(obs, axis=0)
    action_list = []
    with tf.Session() as sess:
        actor_critic = ActorCritic(sess, policy, env.observation_space.shape,
                                   env.action_space, 1, 5)
        if load_path:
            actor_critic.load(load_path)
        else:
            sess.run(tf.global_variables_initializer())
            print('WARNING: No Model Loaded!')
        print(env.unwrapped.scramble_current)
        d = False
        while not d:
            print('-------------------------------------------------')
            print('Current Observation')
            env.render()
            a, v, neg = actor_critic.act(obs, stochastic=True)
            print('')
            print('action: ', actions[a[0]])
            print('value: ', v)
            print('neglogp: ', neg)
            print('pd: ')
            for ac, pd in zip(actions, actor_critic.step_model.logits(obs)[0][0]):
                print('\t', ac, pd)
            obs, r, d, _ = env.step(a[0])
            print('r: ', r)
            obs = np.expand_dims(obs, axis=0)
        env.render()
        env.close()
def setup_agents(self):
    agents = []
    for i in range(self.n_agents):
        model = ActorCritic(n_agents=self.n_agents,
                            state_size=self.state_size,
                            action_size=self.action_size,
                            seed=self.random_seed)
        agents.append(DDPG(i, model, self.action_size, self.random_seed))
    return agents
def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
             gamma, K_epochs, eps_clip):
    self.lr = lr
    self.device = device
    self.betas = betas
    self.gamma = gamma
    self.eps_clip = eps_clip
    self.K_epochs = K_epochs
    self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
    # self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
    self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
    self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
    self.policy_old.load_state_dict(self.policy.state_dict())
    self.MseLoss = nn.MSELoss()
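# A sketch (an assumption, not the source's update method) of the clipped PPO
# surrogate that `policy_old` exists to support. It assumes ActorCritic
# exposes an `evaluate(states, actions)` returning log-probs, state values,
# and entropy; that method name is hypothetical.
import torch

def update(self, states, actions, returns, advantages):
    # Score the sampled actions once under the frozen old policy.
    with torch.no_grad():
        old_logprobs, _, _ = self.policy_old.evaluate(states, actions)
    for _ in range(self.K_epochs):
        logprobs, state_values, entropy = self.policy.evaluate(states, actions)
        ratios = torch.exp(logprobs - old_logprobs)
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
        loss = (-torch.min(surr1, surr2)
                + 0.5 * self.MseLoss(state_values, returns)
                - 0.01 * entropy)
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()
    # Refresh the old policy so the next batch is scored against this one.
    self.policy_old.load_state_dict(self.policy.state_dict())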
def __init__(self, state_dim, action_dim, action_bound=0.4,
             training_batch_size=32, GAMMA=0.95, lr=0.001, replay_buffer_size=1024):
    # The actor sees the state and the goal concatenated, so its input is
    # twice the raw state dimension.
    new_state_dim = 2 * state_dim
    self.state_dim = state_dim
    self.AC = ActorCritic(new_state_dim, action_dim,
                          action_bound=action_bound,
                          training_batch_size=training_batch_size,
                          GAMMA=GAMMA, lr=lr,
                          replay_buffer_size=replay_buffer_size)
def run_tests():
    """Runs tests from a .yaml file, saves result plots and a .csv file.

    Returns:
        results: Test results dataframe.
    """
    with open(FILENAME) as file:
        # Loads the test hyper-parameters as dictionaries.
        tests = yaml.safe_load(file)
    # Create a dataframe to keep the results.
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results['Episode'] = ""
    results['Max average score'] = ""
    for i, test in enumerate(tests['Tests']):
        env = gym.make(test['env'])
        env.reset()
        actor_critic = ActorCritic(env, test['episodes'], test['max_score'],
                                   test['hidden_size'], test['gamma'], test['save'])
        # Run training.
        best_score, episode, rew_hist = actor_critic.train()
        results.loc[i, 'Episode'] = episode
        results.loc[i, 'Max average score'] = best_score
        plot_graphs(test, rew_hist)
    # Save results to a csv file.
    filename = 'results/' + 'test_table.csv'
    results.to_csv(filename)
    return results
def __init__(self, state_dim, action_dim, eps=0.2, gamma=0.99, lambda_=0.95,
             K_epoch=80, batch_size=64):
    super(PPO, self).__init__()
    self.eps = eps
    self.gamma = gamma
    self.lambda_ = lambda_
    self.K_epoch = K_epoch
    self.batch_size = batch_size
    self.model = ActorCritic(state_dim, action_dim)
    self.model_old = ActorCritic(state_dim, action_dim)
    for param in self.model_old.parameters():
        param.requires_grad = False
    self.copy_weights()
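# `copy_weights` is called but not shown; a minimal sketch of what it
# presumably does (an assumption) is a state-dict copy into the frozen
# old-policy network:
def copy_weights(self):
    self.model_old.load_state_dict(self.model.state_dict())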
def _create_actor_critic(self, is_target=False):
    name = 'target_actor_critic' if is_target else 'actor_critic'
    log_tensorboard = not is_target
    actor_critic = ActorCritic(name, self._args, self.env_info,
                               self.action_size, reuse=self.reuse,
                               log_tensorboard=log_tensorboard,
                               is_target=is_target)
    return actor_critic
def __init__(self, parameters):
    self.parameters = parameters
    self.env = gym.make(self.parameters['env'])
    self.nA = self.env.action_space.sample().shape[0]
    self.state_size = self.env.reset().shape[0]
    # Build our replay memory.
    self.memory = Memory(replay_size=self.parameters['replay_size'],
                         action_size=self.nA,
                         state_size=self.state_size,
                         batch_size=self.parameters['batch_size'])
    # Create actor and critic.
    self.actor_critic = ActorCritic(
        actor_lr=parameters['actor_learning_rate'],
        critic_lr=parameters['critic_learning_rate'],
        gamma=parameters['gamma'],
        state_size=self.state_size,
        action_size=self.nA,
        tau=parameters['tau'])
def main():
    human_model = ActorCritic()
    human_model.load_state_dict(torch.load('ac_para.pkl'))
    env = gym.make('CartPole-v1')
    model = AskActorCritic()
    print_interval = 20
    score = 0.0
    for n_epi in range(10000):
        done = False
        s = env.reset()
        step, ask_step = 0, 0
        while not done:
            for t in range(n_rollout):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                if a == 2:  # human action
                    prob = human_model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)
                    a = m.sample().item()
                    model.put_human_data((s, a))
                    ask_step += 1
                s_prime, r, done, info = env.step(a)
                model.put_data((s, a, r, s_prime, done))
                s = s_prime
                score += r
                step += 1
                if done:
                    break
            model.train_net()
        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}, ask rate : {:.2f}".format(
                n_epi, score / print_interval, ask_step / step))
            score = 0.0
def main():
    pixels = (
        (0.0, 1.0, 1.0),
        (0.0, 1.0, 0.0),
        (0.0, 0.0, 1.0),
        (1.0, 1.0, 1.0),
        (1.0, 1.0, 0.0),
        (0.0, 0.0, 0.0),
        (1.0, 0.0, 0.0),
    )
    pixel_to_categorical = {pix: i for i, pix in enumerate(pixels)}
    num_pixels = len(pixels)

    # For each mode in MiniPacman there are different rewards.
    mode_rewards = {
        "regular": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "avoid": [0.1, -0.1, -5, -10, -20],
        "hunt": [0, 1, 10, -20],
        "ambush": [0, -0.1, 10, -20],
        "rush": [0, -0.1, 9.9],
    }
    reward_to_categorical = {
        mode: {reward: i for i, reward in enumerate(mode_rewards[mode])}
        for mode in mode_rewards.keys()
    }

    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())
class StateController(object):
    def __init__(self, state_dim, action_dim, action_bound=0.4,
                 training_batch_size=32, GAMMA=0.95, lr=0.001, replay_buffer_size=1024):
        new_state_dim = 2 * state_dim
        self.state_dim = state_dim
        self.AC = ActorCritic(new_state_dim, action_dim,
                              action_bound=action_bound,
                              training_batch_size=training_batch_size,
                              GAMMA=GAMMA, lr=lr,
                              replay_buffer_size=replay_buffer_size)

    def get_reward(self, resulting_state, goal_state):
        return np.sum((resulting_state - goal_state) ** 2, 1)

    def add_to_replay_buffer(self, state, goal_state, action, resulting_state):
        # The network input is the state concatenated with the goal.
        combined_state = np.concatenate((state, goal_state))
        # But the reward compares the resulting state against the goal.
        reward = self.get_reward(resulting_state, goal_state)
        real_resulting_state = np.concatenate((resulting_state, goal_state))
        self.AC.add_to_replay_buffer(combined_state, action, reward,
                                     real_resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, actions, resulting_states):
        for s, gs, a, rs in zip(states, goal_states, actions, resulting_states):
            self.add_to_replay_buffer(s, gs, a, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_actions(self, states, goal_states):
        combined_states = np.concatenate((states, goal_states), 1)
        return self.AC.get_actions(combined_states)

    def get_random_visited_state(self):
        return self.AC.get_batch(1)[0][0][0:self.state_dim]
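# A hedged usage sketch for StateController; the shapes and data below are
# illustrative assumptions, not from the source. The design point it shows:
# the controller conditions the actor on a goal simply by concatenating
# [state, goal] into one input vector, which is why the network's input
# dimension is 2 * state_dim.
import numpy as np

sc = StateController(state_dim=4, action_dim=2)
states = np.random.randn(16, 4)      # a batch of raw states
goals = np.zeros((16, 4))            # a batch of goal states
acts = sc.get_actions(states, goals) # actor input has shape (16, 8)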
def create_network(self):
    # Normalizers for observations (and goals, when goals are used).
    self.o_stats = Normalizer(size=self.dimo, eps=self.norm_eps,
                              default_clip_range=self.norm_clip)
    if self.use_goal:
        self.g_stats = Normalizer(size=self.dimg, eps=self.norm_eps,
                                  default_clip_range=self.norm_clip)
    else:
        self.g_stats = None
    self.main = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                            self.use_goal).to(self.device)
    self.target = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                              self.use_goal).to(self.device)
    # Start the target network as an exact copy of the main network.
    self.target.actor = copy.deepcopy(self.main.actor)
    self.target.critic = copy.deepcopy(self.main.critic)
    self.actor_optimizer = optim.Adam(self.main.actor.parameters(), lr=self.pi_lr)
    self.critic_optimizer = optim.Adam(self.main.critic.parameters(), lr=self.Q_lr)
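# The deep-copied target network above is typically kept close to the main
# network with a Polyak (soft) update; a minimal sketch, assuming a mixing
# coefficient `polyak` that is not shown in the source:
import torch

def soft_update_target(self, polyak=0.995):
    with torch.no_grad():
        for p_targ, p_main in zip(self.target.parameters(),
                                  self.main.parameters()):
            # target <- polyak * target + (1 - polyak) * main
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1.0 - polyak) * p_main.data)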
def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8):
    names = [str(i) for i in range(1, n_threads + 1)]
    global_actor_critic = ActorCritic(input_shape, n_actions)
    global_actor_critic.share_memory()
    global_optim = SharedAdam(global_actor_critic.parameters())
    if not icm:
        global_icm = None
        global_icm_optim = None
    else:
        global_icm = ICM(input_shape, n_actions)
        global_icm.share_memory()
        global_icm_optim = SharedAdam(global_icm.parameters())
    self.ps = [
        mp.Process(target=worker,
                   args=(name, input_shape, n_actions, global_actor_critic,
                         global_icm, global_optim, global_icm_optim, env_id,
                         n_threads, icm))
        for name in names
    ]
    for p in self.ps:
        p.start()
    for p in self.ps:
        p.join()
def go(resolution):
    env = Env()
    actor = ActorCritic(env, DISCOUNT)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, 'model.ckpt')
        ps = np.linspace(-1, 1, num=resolution)
        vs = np.linspace(-1, 1, num=resolution)
        states = []
        for v in vs:
            for p in ps:
                states.append([p, v])
        states = np.reshape(np.array(states), [resolution * resolution, 2])
        values = actor.get_value(states, sess)
        values = np.reshape(values, [resolution, resolution])
        plt.imshow(values, origin='lower')
        plt.title('Value(position, velocity)')
        minx, maxx = env.low[0], env.high[0]
        xticks = ['{:.2f}'.format(x) for x in np.linspace(minx, maxx, num=10)]
        plt.xticks(np.linspace(0, resolution, num=10), xticks)
        plt.xlabel('Position')
        miny, maxy = env.low[1], env.high[1]
        plt.ylabel('Velocity')
        yticks = ['{:.2f}'.format(y) for y in np.linspace(miny, maxy, num=10)]
        plt.yticks(np.linspace(0, resolution, num=10), yticks)
        plt.show()
def go(logger, render):
    env = Env()
    actor = ActorCritic(env, DISCOUNT)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, 'model.ckpt')
        logger.info('Model restored')
        steps = []
        for episode in range(STEPS_TO_WIN):
            steps.append(play_episode(env, actor, render, sess))
        logger.info('Mean score over {} episodes: {:.2f}'.format(
            STEPS_TO_WIN, np.mean(steps)))
def __init__(self, ids, env, session, global_optimizer, global_max_timesteps,
             state_size, action_size):
    self.ids = ids
    self.env = env
    self.agent_name = "agent_id_" + str(ids)
    self.agent_scope = "agent_id_" + str(ids)
    self.tf_session = session
    self.state_size = state_size
    self.action_size = action_size
    # Create the local actor-critic network.
    self.local_agent = ActorCritic(self.state_size, self.action_size,
                                   self.agent_scope, global_optimizer)
    # Ops that copy the global network's weights into this local agent.
    self.initial_local_ops = self.swap_tf_ops('global', self.agent_scope)
    self.global_max_timesteps = global_max_timesteps
    self.MAX_TIMESTEP_PER_EPISODE = 500
    self.buffer_length = 10  # length of the rollout buffer
    self.gamma = 0.999
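# `swap_tf_ops` is called but not shown; a common A3C-style implementation
# (a sketch under that assumption) builds assign ops that copy trainable
# variables from one variable scope into another:
def swap_tf_ops(self, from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    # Running these ops resets the local agent to the global weights.
    return [t.assign(f) for f, t in zip(from_vars, to_vars)]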
def __init__(self, GUI):
    self.GUI = GUI
    self.env = self.new_env()  # Pendulum model from OpenAI Gym
    self._S_LEN = self.env.observation_space.shape[0]
    self._A_BOUND = self.env.action_space.high
    # The available learning models.
    self.learning_models_list = [
        QLearningModel(state_len=self._S_LEN, action_len=1, a_bound=self._A_BOUND),
        QLearningModel2(state_len=self._S_LEN, action_len=1, a_bound=self._A_BOUND),
        ActorCritic(state_len=self._S_LEN, action_len=1, a_bound=self._A_BOUND),
        ActorCriticDDPG(state_len=self._S_LEN, action_len=1, a_bound=self._A_BOUND),
    ]
    self.learning_model = self.learning_models_list[0]  # Reference to the model in use
    self.s = np.zeros(self._S_LEN)  # Previous pendulum state
    self.reset_env()                # Restart the pendulum
    self.is_learning = True         # Whether to train
    self.working = False            # Whether the pendulum is currently being controlled
    self.endless = False            # Do not restart the pendulum when an episode ends
    self.max_ep_steps = 200         # Number of steps per episode
    self.steps = 0                  # Number of steps taken so far
def train_model(config, gpu_id, save_dir, exp_name):
    env = gym.make(config['env_name'])
    env.seed(1234)
    torch.manual_seed(1234)
    np.random.seed(1234)

    actor = MLP(len(env.observation_space.sample()), config['hidden_layers'],
                env.action_space.n, "distribution", "relu", "standard",
                name="ActorNetwork", verbose=True)
    critic = MLP(len(env.observation_space.sample()), config['hidden_layers'],
                 1, "real_values", "relu", "standard",
                 name="CriticNetwork", verbose=True)
    agent = ActorCritic(actor, critic, config['gamma'],
                        lr_critic=1e-3, lr_actor=1e-5,
                        decay_critic=0.9, decay_actor=0.9,
                        use_cuda=config['use_cuda'], gpu_id=gpu_id)
    """
    if config['resume']:
        agent.load_policy(directory=os.path.join(save_dir, exp_name))
    """

    # TRAINING LOOP
    episode_number = 0
    running_average = None
    loss_tape, episode_lengths = [], []
    while episode_number < config['max_episodes']:
        # Book keeping
        episode_number += 1
        observation = env.reset()
        reward_list = []
        agent.set_state(observation)
        done = False
        t = 0

        # RUN ONE EPISODE
        while not done and t < config['max_steps']:
            action = agent.select_action(observation)
            observation, reward, done, _ = env.step(action)
            if config['env_name'] == "MountainCar-v0":
                done = bool(observation[0] >= 0.5)
            if config['render']:
                env.render()
            if episode_number in config['video_ckpt']:
                image = env.render(mode='rgb_array')
                video_folder = os.path.join(save_dir, exp_name, "video_ckpts")
                if not os.path.exists(video_folder):
                    os.makedirs(video_folder)
                plt.imsave(os.path.join(video_folder,
                                        "ep{}_{}.png".format(episode_number, t)),
                           image)

            # UPDATE THE PARAMETERS (for the Temporal-Difference method)
            agent.compute_gradients(action)
            agent.update_parameters(observation, reward)

            reward_list.append(reward)
            agent.set_state(observation)
            t += 1

        # More book-keeping
        episode_lengths.append(len(reward_list))
        if running_average is None:
            running_average = np.sum(reward_list)
        else:
            running_average = running_average * 0.9 + np.sum(reward_list) * 0.1
        print("Episode: {}, reward: {}, average: {:.2f}".format(
            episode_number, np.sum(reward_list), running_average))

        if episode_number % config['chkp_freq'] == 0:
            # agent.save_policy(directory=os.path.join(save_dir, exp_name))
            utils.save_results_classicControl(save_dir, exp_name,
                                              episode_lengths, config)
    env.close()
def make_env():
    def _thunk():
        env = MiniPacman(mode, 1000)
        return env
    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

state_shape = envs.observation_space.shape
num_actions = envs.action_space.n

env_model = EnvModel(envs.observation_space.shape, envs.action_space.n,
                     num_pixels, len(mode_rewards["regular"]))
actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(env_model.parameters())

env_model = env_model.to(DEVICE)
actor_critic = actor_critic.to(DEVICE)

checkpoint = torch.load(os.path.join(ACTOR_CRITIC_PATH, "actor_critic_checkpoint"))
actor_critic.load_state_dict(checkpoint['actor_critic_state_dict'])

reward_coef = 0.1
num_updates = args.epoch

losses = []
class DDPG():
    def __init__(self, parameters):
        self.parameters = parameters
        self.env = gym.make(self.parameters['env'])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]
        # Build our replay memory.
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])
        # Create actor and critic.
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation.
        global_step_tensor = tf.Variable(0, trainable=False, name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver.
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        self.actor_critic.set_moving_to_target(sess)

        run_id = np.random.randint(10000)
        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise.
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill replay memory with random-action transitions.
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:
            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main loop.
        steps = 0
        for i in range(self.parameters['num_epochs']):
            avg_epoch_rewards = 0
            num_epochs = 1
            for e in range(self.parameters['num_episodes']):
                state = self.env.reset()
                ep_reward = 0
                # Perform rollout.
                while True:
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])
                    assert action.shape == self.env.action_space.shape

                    # # UNCOMMENT TO PRINT ACTIONS
                    # a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0, 0])])
                    # trainwriter.add_summary(a0, steps)
                    # a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0, 1])])
                    # trainwriter.add_summary(a1, steps)
                    # a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0, 2])])
                    # trainwriter.add_summary(a2, steps)
                    # steps += 1

                    next_state, reward, done, _ = self.env.step(action)
                    self.memory.add(state, action, reward, done, next_state)
                    if self.parameters['render_train']:
                        self.env.render()
                    ep_reward += reward
                    if done:
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break
                    state = next_state

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / num_epochs
                num_epochs += 1

                # Perform train.
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = \
                        self.memory.sample()
                    # Train actor-critic model.
                    self.actor_critic.update(sess=sess,
                                             filewriter=trainwriter,
                                             state_batch=s_state,
                                             next_state_batch=s_next_state,
                                             action_batch=s_action,
                                             reward_batch=s_reward,
                                             done_batch=s_terminal)
                    sess.run(increment_global_step)

            # Print out epoch stats here.
            table_data = [['Epoch', 'Average Reward'],
                          [str(i) + "/" + str(self.parameters['num_epochs']),
                           str(avg_epoch_rewards)]]
            table = AsciiTable(table_data, "Training Run: " + str(run_id))
            save_path = saver.save(sess, "./saves/model.ckpt")
            os.system('clear')
            print("Model saved in path: %s" % save_path + "\n" + table.table)

    def test(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        while True:
            state = self.env.reset()
            # Perform rollout.
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                assert action.shape == self.env.action_space.shape
                next_state, reward, done, _ = self.env.step(action)
                self.env.render()
                if done:
                    break
                state = next_state
class DDPG():
    def __init__(self, parameters):
        self.parameters = parameters
        # The env key may carry a suffix after '_'; strip it for gym.make.
        self.env = gym.make(
            self.parameters['env'][:self.parameters['env'].find('_')])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]
        # Build our replay memory.
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])
        # Create actor and critic.
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation.
        global_step_tensor = tf.Variable(0, trainable=False, name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver.
        saver = tf.train.Saver(max_to_keep=None)
        sess = tf.Session(config=config)
        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        self.actor_critic.set_moving_to_target(sess)

        run_id = np.random.randint(10000)
        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise.
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill replay memory with random-action transitions.
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:
            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main loop.
        plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []}
        plots_dir = './plots/'
        weights_dir = './weights/'
        graph_dir = './graph/'
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)
        saver.export_meta_graph(graph_dir + self.parameters['env'] + '/graph.meta')

        # Cumulative step counter.
        cumu_step = 0
        for i in range(self.parameters['num_epochs']):
            avg_epoch_rewards = 0
            n_epochs = 1
            for e in range(self.parameters['num_episodes']):
                state = self.env.reset()
                ep_reward = 0
                ep_n_action = 0
                # Perform rollout.
                for _ in range(500):
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])
                    assert action.shape == self.env.action_space.shape
                    next_state, reward, done, _ = self.env.step(action)
                    # print(action)
                    # print(next_state)
                    # print(reward)
                    self.memory.add(state, action, reward, done, next_state)
                    if self.parameters['render_train']:
                        self.env.render()
                    ep_reward += reward
                    ep_n_action += 1
                    cumu_step += 1
                    state = next_state

                    # Perform train.
                    avg_critic_loss = 0.0
                    avg_actor_loss = 0.0
                    for t in range(self.parameters['num_train_steps']):
                        s_state, s_action, s_reward, s_next_state, s_terminal = \
                            self.memory.sample()
                        # Train actor-critic model.
                        _, _, critic_loss, actor_loss = self.actor_critic.update(
                            sess=sess,
                            filewriter=trainwriter,
                            state_batch=s_state,
                            next_state_batch=s_next_state,
                            action_batch=s_action,
                            reward_batch=s_reward,
                            done_batch=s_terminal)
                        avg_critic_loss += critic_loss
                        avg_actor_loss += actor_loss
                        sess.run(increment_global_step)
                    avg_critic_loss /= self.parameters['num_train_steps']
                    avg_actor_loss /= self.parameters['num_train_steps']

                    if done:
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / n_epochs
                n_epochs += 1

                print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | Cumu Step: {:d}'
                      .format(i + 1, int(ep_reward), avg_critic_loss,
                              avg_actor_loss, e + 1, ep_n_action, cumu_step))

                if e % 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(critic_loss)
                plots['actor_loss'].append(actor_loss)
                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle',
                         'wb'))

    def test(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        saver.restore(
            sess,
            tf.train.latest_checkpoint(
                './weights/HalfCheetah-v2_kirkiles_train50episode_noise_norm_bufsize1Mi1k'
            ))
        while True:
            state = self.env.reset()
            # Perform rollout.
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                # print(action)
                assert action.shape == self.env.action_space.shape
                next_state, reward, done, _ = self.env.step(action)
                self.env.render()
                if done:
                    break
                state = next_state
class Runner(object):
    def __init__(self, env, GOAL_STATE, GAMMA=0.95, lr=0.001):
        self.env = env
        self.GOAL_STATE = GOAL_STATE
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(self.states_dim, self.action_dim,
                                        GAMMA=GAMMA, lr=lr)
        self.min_spread_holder = MinSpreadHolder(self.states_dim)

    def render_if_true(self, render):
        if render:
            self.env.render()

    def get_reward(self, state):
        shifted_goal_state = self.shift_observation(self.GOAL_STATE)
        diff = state - shifted_goal_state
        reward = -1 * np.mean(np.multiply(diff, diff))
        return reward

    def add_observed_batch(self, obs_batch):
        self.min_spread_holder.add_batch(obs_batch)

    def shift_observation(self, obs):
        return self.min_spread_holder.transform(obs)

    def play_random_game(self, render=True, add_to_all_observations=False):
        env = self.env
        observation = env.reset()
        games_observations = []
        for t in range(1000):
            games_observations.append(observation)
            self.render_if_true(render)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                if add_to_all_observations:
                    self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor_with_random(self, render=True, add_to_buffer=True,
                                         prob_random=0.0):
        env = self.env
        obs = env.reset()
        games_observations = []
        for t in range(1000):
            self.render_if_true(render)
            obs = np.asarray(obs)
            games_observations.append(obs)
            shifted_obs = self.shift_observation(obs)
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            if not render and (random.random() < prob_random):
                action = env.action_space.sample()
            # if not render:
            #     for i in range(len(action)):
            #         if random.random() < prob_random:
            #             action[i] = (random.random() * 0.8) - 0.4
            new_obs, reward, done, info = env.step(action)
            shifted_new_obs = self.shift_observation(new_obs)
            if add_to_buffer:
                # real_reward = 0.0 if not done else -1.0
                real_reward = self.get_reward(shifted_new_obs) if not done else -2.0
                self.actor_critic.add_to_replay_buffer(
                    shifted_obs, action, real_reward, shifted_new_obs)
            if done:
                self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break
            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)
class Runner(object):
    def __init__(self, env, GAMMA=0.5):
        self.env = env
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(self.states_dim, self.action_dim, lr=1e-10)
        self.all_observations = np.asarray([])

    def get_means_stddevs(self, num_games=100, min_std_dev=0.01):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        mean = np.mean(observations, axis=0)
        stddev = np.maximum(np.std(observations, axis=0), min_std_dev)
        return mean, stddev

    def write_mean_stddev_to_file(self, num_games=100, min_std_dev=0.01):
        mean, stddev = self.get_means_stddevs(num_games, min_std_dev)
        with open('./mujoco_data/mean_state.json', 'w') as f:
            f.write(json.dumps(mean.tolist()))
        with open('./mujoco_data/stddev_state.json', 'w') as f:
            f.write(json.dumps(stddev.tolist()))
        print('written')

    def get_min_spread(self, num_games=100, min_spread=0.05):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        min_obs = observations.min(axis=0)
        max_obs = observations.max(axis=0)
        spread = np.maximum(max_obs - min_obs, min_spread)
        return min_obs, spread

    def write_min_spread_to_file(self, num_games=100, min_spread=0.05):
        min_obs, spread = self.get_min_spread(num_games, min_spread)
        print(min_obs)
        print(spread)
        print(min_obs.shape, spread.shape)
        with open('./mujoco_data/min_state.json', 'w') as f:
            f.write(json.dumps(min_obs.tolist()))
        with open('./mujoco_data/spread_state.json', 'w') as f:
            f.write(json.dumps(spread.tolist()))
        print('written')

    def play_random_game(self, render=True):
        env = self.env
        observation = env.reset()
        for t in range(1000):
            if render:
                env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor(self, render=True, add_to_buffer=True):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.05)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break
            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(shifted_obs, action,
                                                       real_reward, shifted_new_obs)
            obs = new_obs

    def play_game_from_actor_with_random(self, render=True, add_to_buffer=True,
                                         prob_random=0.05):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.01)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            if not render:
                # Perturb individual action components with small random noise.
                for i in range(len(action)):
                    if random.random() < prob_random:
                        action[i] = (random.random() * 0.8) - 0.4
            # random_move = random.random() < prob_random
            # if random_move and not render:
            #     print('Random move!')
            #     action = env.action_space.sample()
            # else:
            #     action = self.actor_critic.get_actions(
            #         np.asarray([shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)
            if done:
                print(obs, '\n')
                print(new_obs, '\n')
                print(shifted_obs, '\n')
                exit()  # debugging exit; the code below it never runs
                if add_to_buffer:
                    real_reward = -0.10
                    self.actor_critic.add_to_replay_buffer(
                        shifted_obs, action, real_reward, shifted_obs)
                print('Episode finished after {} timesteps'.format(t + 1))
                break
            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(shifted_obs, action,
                                                       real_reward, new_obs)
            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)