def __init__(self, args):
    self.env = create_env(args['env'], args)
    self.args = args
    self.show_actiongrid = False
    self.actiongrid_mode = 'gray'
    self.alpha = 0.5
    self.actiongrid_depth = -1
    self.actiongrid_clip = True
    self.show_stategrid = False
    self.manual_control = False
    # np.zeros(self.env.action_space.shape)
    self.manual_action = self.env.action_space.sample()
    self.manual_action_index = 0
    self.manual_increment_step = 16.
    self.manual_increments = (
        self.env.action_space.high -
        self.env.action_space.low) / self.manual_increment_step
    self.paused = False
    self.advance_step = False
    self.terminate_episode = False
    self.quit = False
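# The manual-control fields above suggest a keyboard-driven override of the
# policy. A minimal sketch of how they could fit together; the helper
# adjust_manual_action is hypothetical, not part of the original code. It
# steps the selected action dimension by 1/16th of its range (one
# manual_increments unit) and clips to the action-space bounds.
import numpy as np

def adjust_manual_action(self, direction):
    # direction is +1 or -1, e.g. from an arrow-key press
    i = self.manual_action_index
    self.manual_action[i] = np.clip(
        self.manual_action[i] + direction * self.manual_increments[i],
        self.env.action_space.low[i],
        self.env.action_space.high[i])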
# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks, but it was not beneficial;
# Hogwild training was far superior.
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    env = create_env(args.env, args)

    # Create the model and the expert to imitate
    AC = importlib.import_module(args.model_name)
    shared_model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args.stack_frames, args)
    EXP = importlib.import_module(args.expert_model_name)
    shared_expert = EXP.ActorCritic(env.observation_space, env.action_space,
                                    args.expert_stack_frames, args)

    if args.load:
        print('Loading model from: {0}{1}.dat'.format(args.load_model_dir,
                                                      args.env))
        saved_state = torch.load(
            '{0}{1}.dat'.format(args.load_model_dir, args.env),
            map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
def evaluate(args):
    start_time = time.time()
    torch.set_default_tensor_type('torch.FloatTensor')
    pthfile = torch.load(args['load_file'],
                         map_location=lambda storage, loc: storage.cpu())

    # Create the output directory
    output_dir = os.path.join(
        os.path.dirname(args['load_file']), args['output_directory'],
        os.path.split(args['env'])[1] + 'evaluation-' +
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S.%f"))
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise
    print('saving to: ' + output_dir + '/')

    start_log_setup = time.time()
    log = {}
    setup_logger('test.log', r'{0}/test.log'.format(output_dir))
    log['test.log'] = logging.getLogger('test.log')
    end_log_setup = time.time()
    print('single evaluate log setup: %d' % (end_log_setup - start_log_setup))

    gpu_id = args['gpu_ids'][-1]
    torch.manual_seed(args['seed'])
    npr.seed(args['seed'] + 1)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args['seed'])

    for k in args.keys():
        log['test.log'].info('{0}: {1}'.format(k, args[k]))

    env = create_env(args['env'], args)
    player = Agent(None, env, args, None)

    # Wrap the environment so that it saves a video
    if args['render_video']:
        player.env = gym.wrappers.Monitor(player.env, output_dir, force=True)

    start_model = time.time()
    AC = importlib.import_module(args['model_name'])
    player.model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args['stack_frames'], args)
    player.gpu_id = gpu_id
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
    if args['load_best']:
        player.model.load_state_dict(pthfile['best_state_dict'])
    else:
        player.model.load_state_dict(pthfile['state_dict'])
    player.model.eval()
    end_model = time.time()
    print('single evaluate model setup time: %d' % (end_model - start_model))

    # Keep track of returns
    all_episode_returns = []
    for i_episode in range(args['num_episodes']):
        player.state, player.info = player.env.reset()
        player.state = torch.from_numpy(player.state).float()
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.state = player.state.cuda()
        player.eps_len = 0
        reward_sum = 0
        episode_step = 0
        while True:
            player.action_test()
            reward_sum += player.reward
            episode_step += 1
            if player.done:
                all_episode_returns.append(reward_sum)
                log['test.log'].info(
                    "Episode_length, {0}, reward_sum, {1}".format(
                        player.eps_len, reward_sum))
                break

    end_episodes = time.time()
    print('single evaluate time for %d episodes: %d' %
          (args['num_episodes'], end_episodes - end_model))
    print('single evaluate seconds per episode: %d' %
          ((end_episodes - end_model) / args['num_episodes']))

    all_episode_returns = np.array(all_episode_returns)
    all_episode_successes = np.array(all_episode_returns > 300.,
                                     dtype=np.float32)
    evaluation_statistics = {
        'Mean Return': np.mean(all_episode_returns),
        'Std Return': np.std(all_episode_returns),
        'Min Return': np.min(all_episode_returns),
        'Max Return': np.max(all_episode_returns),
        'Mean Success': np.mean(all_episode_successes),
        'Number Successes': np.sum(all_episode_successes),
        'Number Total': args['num_episodes'],
        'Std Success': np.std(all_episode_successes),
        'Min Success': np.min(all_episode_successes),
        'Max Success': np.max(all_episode_successes),
        'all_episode_returns': all_episode_returns,
        'all_episode_successes': all_episode_successes,
    }

    # Save raw data to a file
    torch.save(
        {
            'all_episode_returns': all_episode_returns,
            'all_episode_successes': all_episode_successes,
        }, os.path.join(output_dir, 'evaluation_statistics.pth'))

    print('Average Episodic Return:\n\tmean: {0}\n\tstd: {1}\n\tmin: {2}'
          '\n\tmax: {3}'.format(np.mean(all_episode_returns),
                                np.std(all_episode_returns),
                                np.min(all_episode_returns),
                                np.max(all_episode_returns)))
    print('Average Episodic Success:\n\tmean: {0} ({1}/{2})\n\tstd: {3}'
          '\n\tmin: {4}\n\tmax: {5}'.format(np.mean(all_episode_successes),
                                            np.sum(all_episode_successes),
                                            args['num_episodes'],
                                            np.std(all_episode_successes),
                                            np.min(all_episode_successes),
                                            np.max(all_episode_successes)))

    # Shut down the logging system and close open file handles
    start_log_shutdown = time.time()
    logging.shutdown()
    end_time = time.time()
    print('single evaluate log shutdown: %d' %
          (end_time - start_log_shutdown))
    print('single evaluate total time for %d episodes: %d' %
          (args['num_episodes'], end_time - start_time))
    print('single evaluate overall seconds per episode: %f' %
          ((end_time - start_time) / args['num_episodes']))

    return evaluation_statistics
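# A hypothetical invocation of evaluate(). Every key below is read somewhere
# in the function body, but the specific values are illustrative only (the
# success threshold of 300 above suggests BipedalWalker-style returns; that
# is an assumption, as are the checkpoint path and module name).
stats = evaluate({
    'load_file': 'trained_models/model.pth',  # assumed checkpoint path
    'output_directory': 'evaluations',
    'env': 'BipedalWalker-v2',                # assumed environment id
    'model_name': 'model',                    # assumed module name
    'stack_frames': 1,
    'gpu_ids': [-1],                          # CPU only
    'seed': 1,
    'render_video': False,
    'load_best': True,
    'num_episodes': 100,
})
print(stats['Mean Return'], stats['Mean Success'])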
def train(rank, args, shared_model, shared_expert, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args.env, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    player = Agent(None, None, env, args, None)
    player.gpu_id = gpu_id
    AC = importlib.import_module(args.model_name)
    player.model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args.stack_frames, args)
    EXP = importlib.import_module(args.expert_model_name)
    player.expert = EXP.ActorCritic(env.observation_space, env.action_space,
                                    args.expert_stack_frames, args)
    player.expert.load_state_dict(shared_expert.state_dict())
    player.state, player.info = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.expert = player.expert.cuda()
    player.model.train()
    step_count = 0
    while True:
        # Sync the local model with the shared (Hogwild) parameters
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.memory = player.model.initialize_memory()
                    player.expert_memory = player.expert.initialize_memory()
            else:
                player.memory = player.model.initialize_memory()
                player.expert_memory = player.expert.initialize_memory()
        else:
            player.memory = player.model.reinitialize_memory(player.memory)
            player.expert_memory = player.expert.reinitialize_memory(
                player.expert_memory)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state, player.info = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # Imitation (cross-entropy to the expert) loss; the entropy bonus
        # is currently disabled
        policy_loss = 0
        for i in reversed(range(len(player.rewards))):
            policy_loss = policy_loss + player.ces[i].sum()
            # - (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        policy_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        step_count += 1
        if rank == 0 and step_count % 500 == 0:
            print('Model weight/gradient L-inf norm:')

            def _linf_norm(x):
                return str(torch.abs(x).max().item())

            for pname, param in player.model.named_parameters():
                pgradnorm = str(0.)
                if param.grad is not None:
                    pgradnorm = _linf_norm(param.grad)
                print('\t' + pname + ' ' + _linf_norm(param) + '/' +
                      pgradnorm)
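# ensure_shared_grads() is called above but not defined in this file. A
# minimal sketch of what it typically does in Hogwild A3C code, assumed
# from the call signature used here: point the shared model's gradients at
# the local worker's gradients so that optimizer.step() on the shared
# parameters applies this worker's update.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # CPU workers share gradient storage in-place; set it only once
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            # GPU workers must copy gradients back to the CPU-resident
            # shared model
            shared_param._grad = param.grad.cpu()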
def train(rank, args, shared_model, optimizer, thread_step_counter,
          global_step_counter):
    gpu_id = args['gpu_ids'][rank % len(args['gpu_ids'])]
    if args['experiment_id'] == '':
        ptitle('Training Agent: {}'.format(rank))
    else:
        ptitle('EXPID: {} Training Agent: {}'.format(args['experiment_id'],
                                                     rank))
    torch.manual_seed(args['seed'] + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args['seed'] + rank)
    env = create_env(args['env'], args)
    if optimizer is None:
        if args['optimizer'] == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(),
                                      lr=args['lr'])
        if args['optimizer'] == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args['lr'])
    env.seed(args['seed'] + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    AC = importlib.import_module(args['model_name'])
    player.model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args['stack_frames'], args)
    player.state, player.info = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    step_count = 0
    while True:
        # Sync the local model with the shared (Hogwild) parameters
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.memory = player.model.initialize_memory()
            else:
                player.memory = player.model.initialize_memory()
        else:
            player.memory = player.model.reinitialize_memory(player.memory)

        for step in range(args['num_steps']):
            player.action_train()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state, player.info = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            # Bootstrap the return from the value of the current state
            state = player.state
            state = state.unsqueeze(0)
            value, _, _, player.memory = player.model(
                (Variable(state), player.info, player.memory))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args['gamma'] * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args['gamma'] * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args['gamma'] * args['tau'] + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        # Update the mutexed per-thread step counter
        with thread_step_counter.get_lock():
            thread_step_counter.value += 1
        step_count += 1

        if args['train_until'] is not None \
                and global_step_counter.value > args['train_until']:
            break
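# The GAE recursion inside train() above, isolated as a self-contained
# sketch over plain floats. compute_gae is a hypothetical helper for
# illustration; gamma and tau mirror args['gamma'] and args['tau']:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   gae_t   = (gamma * tau) * gae_{t+1} + delta_t, accumulated in reverse.
def compute_gae(rewards, values, gamma=0.99, tau=1.00):
    # values must hold one more entry than rewards: the bootstrap value R,
    # exactly as train() appends it to player.values before the loop.
    assert len(values) == len(rewards) + 1
    gae = 0.0
    advantages = [0.0] * len(rewards)
    for i in reversed(range(len(rewards))):
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gamma * tau * gae + delta_t
        advantages[i] = gae
    return advantages

# Example: two steps with reward 1 and a zero value baseline give
# advantages [1 + gamma * tau, 1] = [1.99, 1.0] for gamma=0.99, tau=1.
assert compute_gae([1.0, 1.0], [0.0, 0.0, 0.0]) == [1.99, 1.0]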
def main(args):
    torch.manual_seed(args['seed'])
    npr.seed(args['seed'] + 1)

    # Create the save directory
    try:
        os.makedirs(args['save_directory'])
    except OSError:
        if not os.path.isdir(args['save_directory']):
            raise
    print('saving to: ' + args['save_directory'] + '/')

    if args['gpu_ids'] == -1:
        args['gpu_ids'] = [-1]
    else:
        torch.cuda.manual_seed(args['seed'])
        mp.set_start_method('spawn')
    env = create_env(args['env'], args)

    # Create model
    AC = importlib.import_module(args['model_name'])
    shared_model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args['stack_frames'], args)
    shared_model.share_memory()

    if args['shared_optimizer']:
        if args['optimizer'] == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(),
                                      lr=args['lr'])
        if args['optimizer'] == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args['lr'],
                                   amsgrad=args['amsgrad'])
        optimizer.share_memory()
    else:
        optimizer = None

    # Keep track of all steps taken in each thread
    all_step_counters = [mp.Value('i', 0) for i in range(args['workers'])]
    global_step_counter = mp.Value('i', 0)

    # Keep track of stats in case we want to load from a checkpoint
    all_scores = []
    all_global_steps = []
    if args['load_file'] != '':
        print('Loading model from: {0}'.format(args['load_file']))
        pthfile = torch.load('{0}'.format(args['load_file']),
                             map_location=lambda storage, loc: storage.cpu())
        if args['load_best']:
            shared_model.load_state_dict(pthfile['best_state_dict'])
            if optimizer is not None:
                optimizer.load_state_dict(pthfile['best_optimizer'])
        else:
            shared_model.load_state_dict(pthfile['state_dict'])
            if optimizer is not None:
                optimizer.load_state_dict(pthfile['optimizer'])
        all_scores = pthfile['all_scores']
        all_global_steps = pthfile['all_global_steps']

    # Only the test process writes to global_step_counter, so training
    # threads do not block on it every gradient step. Threads read it to
    # know when to terminate if args['train_until'] is used.
    if len(all_global_steps) > 0:
        # This increment doesn't have to be atomic
        with global_step_counter.get_lock():
            global_step_counter.value = all_global_steps[-1]

    processes = []
    p = mp.Process(target=test,
                   args=(args, shared_model, optimizer, all_scores,
                         all_global_steps, all_step_counters,
                         global_step_counter))
    p.start()
    processes.append(p)
    time.sleep(0.1)
    for rank in range(0, args['workers']):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, optimizer,
                             all_step_counters[rank], global_step_counter))
        p.start()
        processes.append(p)
        time.sleep(0.1)
    for p in processes:
        time.sleep(0.1)
        p.join()
def test(args, shared_model):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        r'{0}{1}{2}_log'.format(args.log_dir, args.save_prefix, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = create_env(args.env, args)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, None, env, args, None)
    player.gpu_id = gpu_id
    AC = importlib.import_module(args.model_name)
    player.model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args.stack_frames, args)
    player.state, player.info = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    episode_count = 0
    all_scores = []
    max_score = 0
    while True:
        if player.done:
            episode_count += 1
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())

        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, "
                "reward mean {3:.4f}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            # Plot scores every 5 episodes
            all_scores.append(reward_sum)
            if episode_count % 5 == 0:
                plt.clf()
                plt.plot(range(len(all_scores)), all_scores)
                plt.title('Test Episode Returns')
                plt.xlabel('Test Episode')
                plt.ylabel('Return')
                plt.savefig('{0}{1}{2}.png'.format(args.log_dir,
                                                   args.save_prefix,
                                                   args.env))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}{2}.dat'.format(args.save_model_dir,
                                                   args.save_prefix,
                                                   args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}{2}.dat'.format(args.save_model_dir,
                                               args.save_prefix, args.env))

            reward_sum = 0
            player.eps_len = 0
            state, player.info = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
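# setup_logger() is used by both evaluate() and test() above but is not
# defined in this file. A minimal sketch of a compatible implementation,
# assumed from the (logger_name, log_file) call signature: register a named
# logger that writes timestamped lines to the given file.
import logging

def setup_logger(logger_name, log_file, level=logging.INFO):
    logger = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)
    logger.setLevel(level)
    logger.addHandler(file_handler)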