def main(args):
    env = gym.make('CartPole-v0')
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.n

    actor = Actor(dim_state, args.dim_hidden, dim_action)
    critic = Critic(dim_state, args.dim_hidden)
    agent = ActorCriticAgent(env=env, actor=actor, critic=critic,
                             lr=args.lr, gamma=args.gamma, render=args.render)

    scores = 0.0
    history = []
    for i in range(args.n_episodes):
        scores += agent.run_episode()
        if (i + 1) % args.print_interval == 0:
            print(f"[Episode {i+1}] Avg Score: {scores / args.print_interval:.3f}")
            history.append(scores / args.print_interval)
            scores = 0.0
    plot_result(history, args.print_interval)
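# `plot_result` is not defined in this snippet; the sketch below is a minimal
# assumed implementation (the signature and matplotlib usage are guesses, not
# the source's code): plot the average score recorded every `interval` episodes.
import matplotlib.pyplot as plt

def plot_result(history, interval):
    episodes = [interval * (i + 1) for i in range(len(history))]
    plt.plot(episodes, history)
    plt.xlabel('Episode')
    plt.ylabel('Avg score over last %d episodes' % interval)
    plt.show()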
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))

# Create the agent to train with the parameters to use
agent = ActorCriticAgent(state_size=state_size, action_size=action_size, seed=0)

# Run the training
scores_mean_agent, score_mean_last100 = ddpg(env, agent, num_agents, brain_name,
                                             n_episodes=200,
                                             save_checkpoint=True,
                                             simu_name='single_train')

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores_mean_agent)), scores_mean_agent)
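# `ddpg` is defined elsewhere; below is a minimal sketch of such a training
# loop, assuming the classic Unity ML-Agents Python API used above. The
# agent.act / agent.step calls are assumptions about ActorCriticAgent's
# interface, and checkpointing (save_checkpoint, simu_name) is omitted.
from collections import deque

def ddpg(env, agent, num_agents, brain_name, n_episodes=200,
         save_checkpoint=False, simu_name='run'):
    scores_mean_agent = []      # mean score across agents for each episode
    window = deque(maxlen=100)  # rolling window for the last-100 average
    for episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            actions = agent.act(states)                    # assumed API
            env_info = env.step(actions)[brain_name]
            agent.step(states, actions, env_info.rewards,  # assumed API
                       env_info.vector_observations, env_info.local_done)
            scores += env_info.rewards
            states = env_info.vector_observations
            if np.any(env_info.local_done):
                break
        scores_mean_agent.append(scores.mean())
        window.append(scores.mean())
    return scores_mean_agent, np.mean(window)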
def train(shared_model: torch.nn.Module,
          directory: str,
          hyperparams: HyperParams,
          frame_counter: torch.multiprocessing.Value,
          optimizer: torch.optim.Optimizer,
          monitor_queue: Queue,
          process_number: int):
    """Trains an A3C agent on an OpenAI Gym environment."""
    torch.manual_seed(process_number)

    # make environment
    atari = hyperparams.feature_type == 'cnn'
    monitor = process_number == 0
    env = create_environment(hyperparams.env_name, directory,
                             atari=atari, monitor=monitor)
    env.seed(process_number)
    state = env.reset()
    state = torch.from_numpy(state)

    done = False
    episode_reward = 0
    episode_length = 0
    episode_values = []
    episode_start_time = time.time()
    hidden_state = (torch.zeros(1, 256), torch.zeros(1, 256))

    # make agent
    model = ActorCritic(env.observation_space.shape, env.action_space.n,
                        hyperparams.feature_type)
    agent = ActorCriticAgent(model, shared_model)

    # training loop
    while frame_counter.value < hyperparams.max_timesteps:
        # load weights from shared model
        model.load_state_dict(shared_model.state_dict())

        # reset batch
        batch = []

        # run environment to get batch
        for _ in range(hyperparams.batch_size):
            action, value, log_prob, entropy, hidden_state = agent.act(
                state, hidden_state)
            state, reward, done, _ = env.step(action)
            episode_reward += reward
            episode_length += 1
            episode_values.append(value.item())
            batch.append(TimestepInfo(value=value, log_prob=log_prob,
                                      reward=reward, entropy=entropy))

            if done:
                state = env.reset()
                hidden_state = (torch.zeros(1, 256), torch.zeros(1, 256))

            state = torch.from_numpy(state)

            if done:
                now = time.time()
                episode_data = EpisodeData(
                    score=episode_reward,
                    length=episode_length,
                    average_value=np.mean(episode_values),
                    time_taken=now - episode_start_time)
                monitor_queue.put(episode_data)
                with frame_counter.get_lock():
                    frame_counter.value += episode_length
                episode_reward = 0
                episode_length = 0
                episode_values = []
                episode_start_time = now
                break

        # get value of final timestep (zero if the episode ended)
        values = [x.value for x in batch]
        if done:
            values.append(torch.Tensor([0.]))
        else:
            _, value, _ = model(state, hidden_state)
            values.append(value)

        # reflect on batch
        critic_loss = 0
        actor_loss = 0
        gae = torch.Tensor([0])
        real_value = values[-1]
        for i in reversed(range(len(batch))):
            # n-step discounted return as the critic target
            real_value = hyperparams.discount_factor * real_value + batch[i].reward
            advantage = real_value - values[i]
            critic_loss = critic_loss + 0.5 * advantage.pow(2)

            # generalized advantage estimation (hyperparams.gae is the lambda);
            # gae is built from .data, so it is already detached
            value_delta = (batch[i].reward
                           + hyperparams.discount_factor * values[i + 1].data
                           - values[i].data)
            gae = gae * hyperparams.discount_factor * hyperparams.gae + value_delta
            actor_loss = (actor_loss
                          - batch[i].log_prob * gae
                          - hyperparams.entropy_coef * batch[i].entropy)

        optimizer.zero_grad()
        loss = (critic_loss * hyperparams.critic_coef
                + actor_loss * hyperparams.actor_coef)
        loss.backward()

        # clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 50)

        # share gradients with the shared model
        for param, shared_param in zip(model.parameters(),
                                       shared_model.parameters()):
            if shared_param.grad is not None:
                break
            shared_param._grad = param.grad

        optimizer.step()

        # detach the hidden state so gradients do not flow across batches
        hidden_state = (hidden_state[0].data, hidden_state[1].data)
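# The TimestepInfo and EpisodeData containers used above are not defined in
# this snippet; namedtuples with the fields accessed by train() would satisfy
# the interface (a sketch, not the source's definitions):
from collections import namedtuple

TimestepInfo = namedtuple('TimestepInfo',
                          ['value', 'log_prob', 'reward', 'entropy'])
EpisodeData = namedtuple('EpisodeData',
                         ['score', 'length', 'average_value', 'time_taken'])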
def run(train, n_episodes, log_dir, render=False):
    ## init
    env = AtariPong(gamma=0.999, seed=1)
    obs = env.initial_observation()
    agent = ActorCriticAgent(env.n_actions(), initial_observation=obs)

    step_idx = 0     # an episode consists of n >= 1 steps
    episode_idx = 0  # an "episode" refers to a "rally" in the Pong game
    game_idx = 0     # a game consists of n >= 1 episodes
    discounted_returns = [0] * n_episodes  # from the start state of every episode

    ## bookkeeper per game because training is done at the end of a game
    if train:
        training_data = {'obss': [], 'rewards': [], 'labels': []}

    ## main loop
    while episode_idx < n_episodes:
        ## msg
        print('episode_idx= ' + str(episode_idx) +
              ' @step_idx= ' + str(step_idx) +
              ' @game_idx= ' + str(game_idx))
        if render:
            env.render()
            time.sleep(1 / 60.0)

        ## step!
        action, label = agent.act(obs)
        obs, reward, info = env.step(action)
        discounted_returns[episode_idx] += (env.gamma**step_idx) * reward

        ## collect data for training
        if train:
            training_data['obss'].append(obs)
            training_data['rewards'].append(reward)
            training_data['labels'].append(label)

        ## close an episode (== a rally)
        if info['end_of_episode']:
            print('episode_idx= ' + str(episode_idx) +
                  ': ended with G= ' + str('%.3f' % discounted_returns[episode_idx]))
            episode_idx += 1
            step_idx = 0

            if info['end_of_game'] or (episode_idx == n_episodes):
                ## train
                if train:
                    print('training...')
                    ## finalize training data
                    for k in training_data.keys():
                        training_data[k] = np.vstack(training_data[k])
                    training_data['returns'] = env.compute_returns(
                        training_data['rewards'])
                    ## train!
                    agent.train(training_data)
                    ## reset training data
                    training_data = {'obss': [], 'rewards': [], 'labels': []}
                ## set up for the next game
                obs = env.initial_observation()
                game_idx += 1
        else:
            step_idx += 1

    ## closure
    env.close()
    if train:
        print('discounted_returns for the last 10 training episodes:')
        print(str(discounted_returns[-10:]))
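# `env.compute_returns` belongs to the AtariPong wrapper, which is not shown.
# A sketch of what it plausibly computes, written here as a standalone
# function (an assumption): discounted returns over a game, resetting the
# running sum at rally boundaries, where a nonzero reward ends a rally.
def compute_returns(rewards, gamma=0.999):
    rewards = np.asarray(rewards, dtype=np.float64).ravel()
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running = 0.0  # reset at rally boundary (Pong-specific)
        running = running * gamma + rewards[t]
        returns[t] = running
    return returns.reshape(-1, 1)  # matches the vstack'ed column layout above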
def train(train_env, vocab_size, n_iters, log_every=1000, val_envs={}):
    '''Train on the training set, validating on both seen and unseen splits.'''
    agent = ActorCriticAgent(train_env, vocab_size, "", batch_size,
                             max_episode_len)
    data_log = defaultdict(list)
    start = time.time()
    guide_prob = 0.7
    for idx in range(0, n_iters, log_every):
        interval = min(log_every, n_iters - idx)
        iter = idx + interval

        agent.train(interval, guide_prob)
        train_losses = np.array(agent.losses)
        train_loss_avg = np.average(train_losses)
        data_log['train loss'].append(train_loss_avg)
        loss_str = ''

        # Run validation
        for env_name, (env, evaluator) in val_envs.items():
            agent.env = env
            agent.results_path = '%s%s_%s_iter_%d.json' % (
                RESULT_DIR, model_prefix, env_name, iter)
            agent.test(0.0)
            agent.write_results()
            score_summary, _ = evaluator.score(agent.results_path)
            loss_str += ', %s' % env_name
            for metric, val in score_summary.items():
                data_log['%s %s' % (env_name, metric)].append(val)
                if metric in ['success_rate']:
                    loss_str += ' success: %.2f' % val

        agent.env = train_env
        print('%s (%d %d%%) %s' % (timeSince(start, float(iter) / n_iters),
                                   iter, float(iter) / n_iters * 100, loss_str))

        # anneal the guidance probability toward zero
        guide_prob -= 0.01
        guide_prob = max(guide_prob, 0.0)
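# `timeSince` is not defined in this snippet; a sketch in the style of the
# PyTorch seq2seq tutorial helper of the same name (an assumption here):
import math
import time

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    # elapsed time so far, plus the estimated time remaining
    s = time.time() - since
    es = s / percent  # estimated total
    return '%s (- %s)' % (asMinutes(s), asMinutes(es - s))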
        gamma=args.gamma,
        batch_size=args.batch_size,
        replay_memory_size=args.replay_memory_size,
        hidden_size=args.hidden_size,
        model_input_size=env.observation_space.shape[0],
        use_PER=args.use_PER,
        use_ICM=args.use_ICM)
    trainQ(a, env, args.MAX_NUMBER_OF_STEPS, args.EPISODES_TO_TRAIN,
           args.START_RENDERING, args.update_frequency)
else:
    if not args.use_ICM:
        a = ActorCriticAgent(
            continuous=False,
            nb_actions=env.action_space.n,
            learning_rate=args.learning_rate,
            gamma=args.gamma,
            hidden_size=args.hidden_size,
            model_input_size=env.observation_space.shape[0],
            entropy_coeff_start=args.entropy_coefficient_start,
            entropy_coeff_end=args.entropy_coefficient_end,
            entropy_coeff_anneal=args.entropy_anneal)
        trainActor(a, env, args.MAX_NUMBER_OF_STEPS, args.EPISODES_TO_TRAIN,
                   args.START_RENDERING)
    else:
        a = ActorCriticAgentUsingICM(
            continuous=False,
            nb_actions=env.action_space.n,
            learning_rate=args.learning_rate,
            gamma=args.gamma,
            hidden_size=args.hidden_size,
            model_input_size=env.observation_space.shape[0],
            entropy_coeff_start=args.entropy_coefficient_start,
def main():
    # env = gym.make('InvertedPendulum-v1')  # alternative environment
    env = gym.make('Pendulum-v0')
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    agent = ActorCriticAgent(state_dim, action_dim)
    state = env.reset()
    timestep_limit = env.spec.timestep_limit
    start_time = t.time()

    # Initial data build-up: fill the replay memory with random transitions
    done_flag = 0
    for i in range(REPLAY_MEMORY):
        if done_flag:
            state = env.reset()
        action = env.action_space.sample()
        next_state, reward, done_flag, info = env.step(action)
        agent.append_memory(state, action, reward, next_state, done_flag)
        state = next_state
    print("Initial memory built!!")

    # Initial training for a few steps
    for _ in range(5):
        agent.update_network()
    print("Initial network performance = ", policy_evaluation(agent, env, 5))

    # =========================================================================
    print("******** Starting learning process *************")
    num_episodes = 2
    update_freq = 1  # update after how many steps (within each episode)
    print_freq = 1   # how often to print (episodes)

    performance = np.zeros(num_episodes)
    best_ep = 0
    best_agent = copy.deepcopy(agent)

    for ep in range(num_episodes):
        done_flag = 0
        state = env.reset()
        time = 0
        while not done_flag and time <= timestep_limit:
            action_pred = np.array(agent.actor_net.predict(state.reshape(1, -1)))
            action_pred = action_pred[0]
            next_state, reward, done_flag, _ = env.step(action_pred)
            agent.append_memory(state, action_pred, reward, next_state, done_flag)
            state = next_state
            if time % update_freq == 0:
                agent.update_network()
            time += 1

        performance[ep] = policy_evaluation(agent, env, 2)
        if ep % print_freq == 0:
            print("Now in episode: ", ep, " of ", num_episodes)
            print("Agent performance = ", performance[ep])
        if performance[ep] > performance[best_ep]:
            best_agent = copy.deepcopy(agent)
            best_ep = ep

    end_time = t.time()
    print("Total time", end_time - start_time)

    plt.plot(performance[-100:])
    plt.show()
    inspect_performance(agent, env)

    # Save the best agent and the performance curve to file
    with open('objs.pickle', 'wb') as f:
        pickle.dump([best_agent, performance], f)
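# `policy_evaluation` is not defined in this snippet; a minimal sketch under
# the same assumptions as main() (deterministic actor, classic gym API):
def policy_evaluation(agent, env, n_episodes):
    returns = np.zeros(n_episodes)
    for ep in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = np.array(agent.actor_net.predict(state.reshape(1, -1)))[0]
            state, reward, done, _ = env.step(action)
            returns[ep] += reward
    return returns.mean()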
def main():
    agent = ActorCriticAgent('agent')
    env = PaintEnv('env')
import gym
import math
from matplotlib import pyplot as plt

'''
- tensorboard
- checkpointing
- worker frame collecting
- batch ppo
'''

if __name__ == '__main__':
    # make agent
    # inputShape = (10, 240, 256)
    agent = ActorCriticAgent(alpha=0.001, inputChannels=1, gamma=0.99,
                             numActions=7)

    # make env
    from nes_py.wrappers import JoypadSpace
    import gym_super_mario_bros
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # env = gym_super_mario_bros.make('SuperMarioBros-2-1-v1')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    scoreHistory = []
    numHiddenEpisodes = -1
    highScore = -math.inf
    recordTimeSteps = math.inf
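    # The script is truncated above; a hedged sketch of the episode loop these
    # variables set up might look like this (agent.act and agent.learn are
    # assumptions about this ActorCriticAgent's interface):
    for episode in range(500):
        state = env.reset()
        done, score = False, 0
        while not done:
            if episode > numHiddenEpisodes:
                env.render()
            action = agent.act(state)                            # assumed API
            nextState, reward, done, info = env.step(action)
            agent.learn(state, action, reward, nextState, done)  # assumed API
            state = nextState
            score += reward
        scoreHistory.append(score)
        highScore = max(highScore, score)
        print('episode %d: score %d (best %d)' % (episode, score, highScore))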
def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------ Parameters of the experiment ------')
    print('\t\t --------------------------------------------\n')

    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')

    print('## Network params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('Activation function : ' + network_config['activation'])
    print('State embedding size : ', network_config['state_embedding_size'])
    print('')

    print('## Training params')
    print('Number of iterations : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')

    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')
    sleep(2.0)

    # Init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name,
                                       files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name,
                                    files_config['results_log_path'])

    # fix random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if training from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dirs for tensorboard and checkpoints
        flush_or_create(tensorboard_log_dir)
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config,
                                 checkpoints_dir, tensorboard_log_dir)
        # initialize iteration number
        start = 0
    # else restart training from the last checkpoint
    else:
        agent = ActorCriticAgent(agent_config, network_config,
                                 checkpoints_dir, tensorboard_log_dir,
                                 restore=True)
        latest_checkpoint = training_config["init_checkpoint"]
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # initialize the results log file to write to during evaluation
    log_file = open(results_log_path, "wb+")

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]

    summary_dict = dict({})
    data_buffer = Buffer(capacity=training_config['buffer_size'])
    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate the buffer with initial data from random games
        print('\nPopulating buffer ...\n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with the latest checkpoint and track the average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')
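# `flush_or_create` is not shown in the source; its usage above suggests the
# following sketch (an assumption): empty the directory if it exists,
# otherwise create it.
import os
import shutil

def flush_or_create(path):
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)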
import numpy as np
from agent import ActorCriticAgent
from gridworld import GridWorld

env = GridWorld()
agent = ActorCriticAgent(savepath='saved_models', load=False)

for i in np.arange(1, 10001):
    obs, end, reward = env.reset()
    while end == 0:
        action, predicted_value = agent.act(obs, reward)
        obs, end, reward = env.step(action)

    # probe the critic with a hand-built observation to inspect its value estimate
    obs = np.zeros((5, 5, 2))
    obs[1, 0, 0] = 1
    obs[1, 1, 1] = 1
    obs[2, 2, 1] = 1
    action, predicted_value = agent.act(obs, reward=None)
    print(predicted_value)

    agent.episode_end(end, reward)
    print('Total Reward: {}'.format(env.total_reward))