from collections import defaultdict
import time

import numpy as np

# Relies on module-level names: ActorCriticAgent, batch_size, max_episode_len,
# RESULT_DIR, model_prefix, timeSince.


def train(train_env, vocab_size, n_iters, log_every=1000, val_envs=None):
    ''' Train on training set, validating on both seen and unseen. '''
    if val_envs is None:
        val_envs = {}

    agent = ActorCriticAgent(train_env, vocab_size, "", batch_size,
                             max_episode_len)
    data_log = defaultdict(list)
    start = time.time()

    guide_prob = 0.7
    for idx in range(0, n_iters, log_every):
        interval = min(log_every, n_iters - idx)
        iter = idx + interval
        agent.train(interval, guide_prob)

        train_losses = np.array(agent.losses)
        train_loss_avg = np.average(train_losses)
        data_log['train loss'].append(train_loss_avg)
        loss_str = ''  # 'guide prob: %.2f' % guide_prob
        # loss_str += ', train loss: %.4f' % train_loss_avg

        # Run validation
        for env_name, (env, evaluator) in val_envs.items():
            agent.env = env
            agent.results_path = '%s%s_%s_iter_%d.json' % (
                RESULT_DIR, model_prefix, env_name, iter)
            agent.test(0.0)  # guide_prob)
            # val_losses = np.array(agent.losses)
            # val_loss_avg = np.average(val_losses)
            # data_log['%s loss' % env_name].append(val_loss_avg)
            agent.write_results()
            score_summary, _ = evaluator.score(agent.results_path)
            # loss_str += ', %s loss: %.4f' % (env_name, val_loss_avg)
            loss_str += ', %s' % (env_name)
            for metric, val in score_summary.items():
                data_log['%s %s' % (env_name, metric)].append(val)
                if metric in ['success_rate']:
                    loss_str += ' success: %.2f' % (val)

        agent.env = train_env

        print('%s (%d %d%%) %s' % (timeSince(start, float(iter) / n_iters),
                                   iter, float(iter) / n_iters * 100,
                                   loss_str))
        # anneal the teacher-forcing / guidance probability
        guide_prob -= 0.01
        guide_prob = max(guide_prob, 0.0)
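# train() formats its progress line with a timeSince(start, fraction) helper
# that is not defined in this file. Below is a minimal sketch of the
# conventional elapsed / estimated-remaining formatter it appears to expect;
# the asMinutes helper name is an assumption, not necessarily the project's.

import math
import time


def asMinutes(s):
    # format a duration in seconds as "Xm Ys"
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    # elapsed time so far, plus the projected time remaining given the
    # fraction of work completed (assumed semantics)
    now = time.time()
    s = now - since
    es = s / percent  # projected total duration
    rs = es - s       # projected remaining duration
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))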
import time

import numpy as np

# Relies on module-level names: AtariPong, ActorCriticAgent.


def run(train, n_episodes, log_dir, render=False):
    ## init
    env = AtariPong(gamma=0.999, seed=1)
    obs = env.initial_observation()
    agent = ActorCriticAgent(env.n_actions(), initial_observation=obs)

    step_idx = 0     # an episode consists of n >= 1 steps
    episode_idx = 0  # an "episode" refers to a "rally" in Pong
    game_idx = 0     # a game consists of n >= 1 episodes
    discounted_returns = [0] * n_episodes  # from the start state of every episode

    ## bookkeeping kept per game because training is done at the end of a game
    if train:
        training_data = {'obss': [], 'rewards': [], 'labels': []}

    ## main loop
    while episode_idx < n_episodes:
        ## msg
        print('episode_idx= ' + str(episode_idx) +
              ' @step_idx= ' + str(step_idx) +
              ' @game_idx= ' + str(game_idx))
        if render:
            env.render()
            time.sleep(1 / 60.0)

        ## step!
        action, label = agent.act(obs)
        obs, reward, info = env.step(action)
        discounted_returns[episode_idx] += (env.gamma**step_idx) * reward

        ## collect data for training
        if train:
            training_data['obss'].append(obs)
            training_data['rewards'].append(reward)
            training_data['labels'].append(label)

        ## close an episode (== a rally)
        if info['end_of_episode']:
            print('episode_idx= ' + str(episode_idx) +
                  ': ended with G= ' +
                  str('%.3f' % discounted_returns[episode_idx]))
            episode_idx += 1
            step_idx = 0

            if info['end_of_game'] or (episode_idx == n_episodes):
                ## train
                if train:
                    print('training...')

                    ## finalize training data
                    for k in training_data.keys():
                        training_data[k] = np.vstack(training_data[k])
                    training_data['returns'] = env.compute_returns(
                        training_data['rewards'])

                    ## train!
                    agent.train(training_data)

                    ## reset training data
                    training_data = {'obss': [], 'rewards': [], 'labels': []}

                ## set up for the next game
                obs = env.initial_observation()
                game_idx += 1
        else:
            step_idx += 1

    ## closure
    env.close()
    if train:
        print('discounted_returns for the last 10 training episodes:')
        print(str(discounted_returns[-10:]))
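# run() defers the return computation to env.compute_returns(rewards). A
# plausible implementation, written here as a free function for illustration:
# it assumes Pong's convention that a nonzero reward marks the end of a rally,
# so the running return resets at each boundary. The explicit gamma argument
# and the normalize option are assumptions, not the project's actual API.

import numpy as np


def compute_returns(rewards, gamma, normalize=True):
    # per-step discounted returns, accumulated backwards over the game
    rewards = np.asarray(rewards, dtype=np.float64).ravel()
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running = 0.0  # nonzero reward => rally boundary, reset
        running = running * gamma + rewards[t]
        returns[t] = running
    if normalize:
        # standardizing returns often stabilizes policy-gradient updates
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns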
import logging
import os
import pickle
from time import sleep

import numpy as np
from tqdm import tqdm

# Relies on module-level names: read_config, Env, ActorCriticAgent, Buffer,
# flush_or_create, populate_buffer.


def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------ Parameters of the experiment ------')
    print('\t\t --------------------------------------------\n')

    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')

    print('## Network params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('Activation function : ' + network_config['activation'])
    print('State embedding size : ', network_config['state_embedding_size'])
    print('')

    print('## Training params')
    print('Number of iterations : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')

    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')
    sleep(2.0)

    # Init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name,
                                       files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name,
                                    files_config['results_log_path'])

    # fix random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if training from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dir for tensorboard
        flush_or_create(tensorboard_log_dir)
        # initialize dir for checkpoints
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config,
                                 checkpoints_dir, tensorboard_log_dir)
        # initialize iteration number
        start = 0

    # else restart training from the last checkpoint
    else:
        # resume from the checkpoint number given in the config
        latest_checkpoint = training_config["init_checkpoint"]
        agent = ActorCriticAgent(agent_config, network_config,
                                 checkpoints_dir, tensorboard_log_dir,
                                 restore=True)
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # open the results log file to write to during evaluation
    log_file = open(results_log_path, "wb+")

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]
    # gamma = agent_config['gamma']

    summary_dict = {}

    data_buffer = Buffer(capacity=training_config['buffer_size'])
    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate the buffer with initial data from random games
        print('\nPopulating Buffer ...\n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with the latest checkpoint and track average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')
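# main() leans on a few helpers that are not shown here: flush_or_create and
# the Buffer it passes to agent.train. Minimal sketches follow, under the
# obvious reading of the calls above; the add/sample method names on Buffer
# are assumptions for illustration, not the project's actual API.

import os
import random
import shutil
from collections import deque


def flush_or_create(path):
    # empty the directory at `path` if it exists, then (re)create it
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


class Buffer:
    # FIFO experience buffer with a fixed capacity: old samples are
    # evicted automatically once the deque is full
    def __init__(self, capacity):
        self.data = deque(maxlen=capacity)

    def add(self, sample):
        self.data.append(sample)

    def sample(self, batch_size):
        # uniformly sample up to batch_size stored items
        return random.sample(list(self.data), min(batch_size, len(self.data)))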