Example #1
    def __init__(self, args):

        self.env = create_env(args['env'], args)
        self.args = args

        self.show_actiongrid = False
        self.actiongrid_mode = 'gray'
        self.alpha = 0.5
        self.actiongrid_depth = -1
        self.actiongrid_clip = True
        self.show_stategrid = False

        self.manual_control = False
        # Start from a randomly sampled action;
        # np.zeros(self.env.action_space.shape) would start from the zero action.
        self.manual_action = self.env.action_space.sample()
        self.manual_action_index = 0
        self.manual_increment_step = 16.
        self.manual_increments = (
            self.env.action_space.high -
            self.env.action_space.low) / self.manual_increment_step

        self.paused = False
        self.advance_step = False
        self.terminate_episode = False
        self.quit = False
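
A minimal sketch (not part of the original snippet) of how the manual-control fields initialized above could be used: a hypothetical helper that nudges the currently selected action dimension by one increment and clips the result back into the action space. The class and method names here are illustrative only.

import numpy as np


class ManualControlMixin:
    def increment_selected_action(self, direction):
        # direction is +1 or -1: move the selected dimension by one increment
        # and keep the action inside the Box bounds.
        step = direction * self.manual_increments[self.manual_action_index]
        self.manual_action[self.manual_action_index] += step
        self.manual_action = np.clip(self.manual_action,
                                     self.env.action_space.low,
                                     self.env.action_space.high)

    def select_next_action_dimension(self):
        # Cycle which action dimension the manual controls currently edit.
        self.manual_action_index = ((self.manual_action_index + 1) %
                                    self.env.action_space.shape[0])
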
# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Multiprocessing with explicit locks was tried but gave no benefit; Hogwild
# (lock-free) training was far superior.

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    env = create_env(args.env, args)

    # Create model
    AC = importlib.import_module(args.model_name)
    shared_model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args.stack_frames, args)
    EXP = importlib.import_module(args.expert_model_name)
    shared_expert = EXP.ActorCritic(env.observation_space, env.action_space,
                                    args.expert_stack_frames, args)
    if args.load:
        print('Loading model from: {0}{1}.dat'.format(args.load_model_dir,
                                                      args.env))
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
Example #3
def evaluate(args):
    start_time = time.time()
    torch.set_default_tensor_type('torch.FloatTensor')

    pthfile = torch.load(args['load_file'],
                         map_location=lambda storage, loc: storage.cpu())

    # Create the output directory
    output_dir = os.path.join(
        os.path.dirname(args['load_file']), args['output_directory'],
        os.path.split(args['env'])[1] + 'evaluation-' +
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S.%f"))
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise
    print('saving to: ' + output_dir + '/')

    start_log_setup = time.time()
    log = {}
    setup_logger('test.log', r'{0}/test.log'.format(output_dir))
    log['test.log'] = logging.getLogger('test.log')
    end_log_setup = time.time()
    print('single evaluate log setup: %d' % (end_log_setup - start_log_setup))

    gpu_id = args['gpu_ids'][-1]

    torch.manual_seed(args['seed'])
    npr.seed(args['seed'] + 1)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args['seed'])

    for k in args.keys():
        log['test.log'].info('{0}: {1}'.format(k, args[k]))

    env = create_env(args['env'], args)
    player = Agent(None, env, args, None)

    # Wrap the environment so that it saves a video
    if args['render_video']:
        player.env = gym.wrappers.Monitor(player.env, output_dir, force=True)

    start_model = time.time()
    AC = importlib.import_module(args['model_name'])
    player.model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args['stack_frames'], args)

    player.gpu_id = gpu_id
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()

    if args['load_best']:
        player.model.load_state_dict(pthfile['best_state_dict'])
    else:
        player.model.load_state_dict(pthfile['state_dict'])
    player.model.eval()

    end_model = time.time()
    print('single evaluate model setup time: %d' % (end_model - start_model))

    # Keep track of returns
    all_episode_returns = []
    for i_episode in range(args['num_episodes']):
        player.state, player.info = player.env.reset()
        player.state = torch.from_numpy(player.state).float()
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.state = player.state.cuda()
        player.eps_len = 0
        reward_sum = 0
        episode_step = 0
        while True:
            player.action_test()
            reward_sum += player.reward
            episode_step += 1

            if player.done:
                all_episode_returns.append(reward_sum)
                #num_tests += 1
                #reward_total_sum += reward_sum
                #reward_mean = reward_total_sum / num_tests
                log['test.log'].info(
                    "Episode_length, {0}, reward_sum, {1}".format(
                        player.eps_len, reward_sum))
                break
    end_episodes = time.time()
    print('single evaluate time for %d episodes: %d' %
          (args['num_episodes'], end_episodes - end_model))
    print('single evaluate seconds per episode: %d' %
          ((end_episodes - end_model) / args['num_episodes']))
    all_episode_returns = np.array(all_episode_returns)
    all_episode_successes = np.array(all_episode_returns > 300.,
                                     dtype=np.float32)

    evaluation_statistics = {
        'Mean Return': np.mean(all_episode_returns),
        'Std Return': np.std(all_episode_returns),
        'Min Return': np.min(all_episode_returns),
        'Max Return': np.max(all_episode_returns),
        'Mean Success': np.mean(all_episode_successes),
        'Number Successes': np.sum(all_episode_successes),
        'Number Total': args['num_episodes'],
        'Std Success': np.std(all_episode_successes),
        'Min Success': np.min(all_episode_successes),
        'Max Success': np.max(all_episode_successes),
        'all_episode_returns': all_episode_returns,
        'all_episode_successes': all_episode_successes,
    }

    # Save raw data to a file
    torch.save(
        {
            'all_episode_returns': all_episode_returns,
            'all_episode_successes': all_episode_successes,
        }, os.path.join(output_dir, 'evaluation_statistics.pth'))

    print('Average Episodic Return: \n\tmean: {0}\n\tstd: {1}\n\tmin: {2}'
          '\n\tmax: {3}'.format(np.mean(all_episode_returns),
                                np.std(all_episode_returns),
                                np.min(all_episode_returns),
                                np.max(all_episode_returns)))
    print('Average Episodic Success: \n\tmean: {0} ({1}/{2})\n\tstd: {3}'
          '\n\tmin: {4}\n\tmax: {5}'.format(np.mean(all_episode_successes),
                                            np.sum(all_episode_successes),
                                            args['num_episodes'],
                                            np.std(all_episode_successes),
                                            np.min(all_episode_successes),
                                            np.max(all_episode_successes)))

    # Shut down logging system and close open file handles
    start_log_shutdown = time.time()
    logging.shutdown()

    end_time = time.time()
    print('single evaluate log shutdown: %d' % (end_time - start_log_shutdown))
    print('single evaluate total time for %d episodes: %d' %
          (args['num_episodes'], end_time - start_time))
    print('single evaluate overall seconds per episode: %f' %
          ((end_time - start_time) / args['num_episodes']))
    return evaluation_statistics
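
evaluate() (and the test worker further below) calls setup_logger(name, path), which is not included in these examples. A minimal sketch of what such a helper typically looks like, assuming it simply attaches a file handler to a named logger; the actual implementation in the source repository may differ.

import logging


def setup_logger(logger_name, log_file, level=logging.INFO):
    # Create (or fetch) a named logger and route its records to log_file.
    logger = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)
    logger.setLevel(level)
    logger.addHandler(file_handler)
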
def train(rank, args, shared_model, shared_expert, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args.env, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, None, env, args, None)
    player.gpu_id = gpu_id
    AC = importlib.import_module(args.model_name)
    player.model = AC.ActorCritic(
        env.observation_space, env.action_space, args.stack_frames, args)
    EXP = importlib.import_module(args.expert_model_name)
    player.expert = EXP.ActorCritic(
        env.observation_space, env.action_space, args.expert_stack_frames, args)
    player.expert.load_state_dict(shared_expert.state_dict())

    player.state, player.info = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.expert = player.expert.cuda()
    player.model.train()

    step_count = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.memory = player.model.initialize_memory()
                    player.expert_memory = player.expert.initialize_memory()
            else:
                player.memory = player.model.initialize_memory()
                player.expert_memory = player.expert.initialize_memory()
        else:
            player.memory = player.model.reinitialize_memory(player.memory)
            player.expert_memory = player.expert.reinitialize_memory(player.expert_memory)

        for step in range(args.num_steps):

            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state, player.info = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # Imitation + Entropy loss
        policy_loss = 0
        for i in reversed(range(len(player.rewards))):
            policy_loss = policy_loss + player.ces[i].sum()
            # Entropy bonus, currently disabled:
            # policy_loss = policy_loss - 0.01 * player.entropies[i].sum()

        player.model.zero_grad()
        policy_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        step_count += 1
        if rank == 0 and step_count % 500 == 0:
            print('Model weight/gradient L-inf norm:')

            def _linf_norm(x):
                # L-infinity norm: largest absolute entry of the tensor.
                return str(torch.abs(x).max().item())

            for pname, param in player.model.named_parameters():
                pgradnorm = str(0.)
                if param.grad is not None:
                    pgradnorm = _linf_norm(param.grad)
                print('\t' + pname + ' ' + _linf_norm(param) + '/' + pgradnorm)
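
Both train() examples call ensure_shared_grads(player.model, shared_model, gpu=...) right before optimizer.step(), but the helper itself is not shown. A minimal sketch of the usual Hogwild-style version, assuming the signature matches the calls above: copy (or alias) the worker's gradients onto the shared model so the shared optimizer updates the parameters every process reads.

def ensure_shared_grads(model, shared_model, gpu=False):
    # Point the shared model's gradients at (or copy them from) the local
    # worker's gradients; lock-free, in the spirit of Hogwild training.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # On CPU the gradient tensors are already shared; nothing to do.
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            shared_param._grad = param.grad.cpu()
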
Example #5
def train(rank, args, shared_model, optimizer, thread_step_counter,
        global_step_counter):
    gpu_id = args['gpu_ids'][rank % len(args['gpu_ids'])]
    if args['experiment_id'] == '':
        ptitle('Training Agent: {}'.format(rank))
    else:
        ptitle('EXPID: {} Training Agent: {}'.format(args['experiment_id'], rank))
    torch.manual_seed(args['seed'] + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args['seed'] + rank)
    env = create_env(args['env'], args)
    if optimizer is None:
        if args['optimizer'] == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args['lr'])
        if args['optimizer'] == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args['lr'])

    env.seed(args['seed'] + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    AC = importlib.import_module(args['model_name'])
    player.model = AC.ActorCritic(
        env.observation_space, env.action_space, args['stack_frames'], args)

    player.state, player.info = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    step_count = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.memory = player.model.initialize_memory()
            else:
                player.memory = player.model.initialize_memory()
        else:
            player.memory = player.model.reinitialize_memory(player.memory)

        for step in range(args['num_steps']):

            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state, player.info = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            state = player.state
            state = state.unsqueeze(0)
            value, _, _, player.memory = player.model(
                (Variable(state), player.info, player.memory))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args['gamma'] * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            # print(player.rewards[i])
            delta_t = player.rewards[i] + args['gamma'] * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args['gamma'] * args['tau'] + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        # Update mutexed number of steps
        with thread_step_counter.get_lock():
            thread_step_counter.value += 1

        step_count += 1

        if args['train_until'] is not None \
                and global_step_counter.value > args['train_until']:
            break
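
For reference, the backward loop above computes a bootstrapped return together with Generalized Advantage Estimation; with $\gamma$ = args['gamma'] and $\tau$ = args['tau'],

$$\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \sum_{k \ge 0} (\gamma\tau)^k \,\delta_{t+k},$$

which is accumulated as gae in the loop. Each step then adds $-\log\pi(a_t \mid s_t)\,\hat{A}_t - 0.01\,\mathcal{H}_t$ to the policy loss and $\tfrac{1}{2}\,(R_t - V(s_t))^2$ to the value loss, where $R_t$ is the discounted return bootstrapped from the critic's value of the last state.
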
def main(args):
    torch.manual_seed(args['seed'])
    npr.seed(args['seed'] + 1)

    # Create the save directory
    try:
        os.makedirs(args['save_directory'])
    except OSError:
        if not os.path.isdir(args['save_directory']):
            raise
    print('saving to: ' + args['save_directory'] + '/')

    if args['gpu_ids'] == -1:
        args['gpu_ids'] = [-1]
    else:
        torch.cuda.manual_seed(args['seed'])
        mp.set_start_method('spawn')
    env = create_env(args['env'], args)

    # Create model
    AC = importlib.import_module(args['model_name'])
    shared_model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args['stack_frames'], args)

    shared_model.share_memory()

    if args['shared_optimizer']:
        if args['optimizer'] == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args['lr'])
        if args['optimizer'] == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args['lr'],
                                   amsgrad=args['amsgrad'])
        optimizer.share_memory()
    else:
        optimizer = None

    # Keep track of all steps taken in each thread
    all_step_counters = [mp.Value('i', 0) for i in range(args['workers'])]
    global_step_counter = mp.Value('i', 0)

    # Keep track of stats if we want to load from a checkpoint
    all_scores = []
    all_global_steps = []
    if args['load_file'] != '':
        print('Loading model from: {0}'.format(args['load_file']))
        pthfile = torch.load('{0}'.format(args['load_file']),
                             map_location=lambda storage, loc: storage.cpu())
        if args['load_best']:
            shared_model.load_state_dict(pthfile['best_state_dict'])
            if optimizer is not None:
                optimizer.load_state_dict(pthfile['best_optimizer'])
        else:
            shared_model.load_state_dict(pthfile['state_dict'])
            if optimizer is not None:
                optimizer.load_state_dict(pthfile['optimizer'])
            all_scores = pthfile['all_scores']
            all_global_steps = pthfile['all_global_steps']

    # Only test process will write to this to avoid each thread waiting every
    # gradient step to update. Threads will read from global_step_counter to
    # know when to terminate if args['test_until'] is used
    if len(all_global_steps) > 0:
        # This initial assignment doesn't strictly need to be atomic
        with global_step_counter.get_lock():
            global_step_counter.value = all_global_steps[-1]

    processes = []

    p = mp.Process(target=test,
                   args=(args, shared_model, optimizer, all_scores,
                         all_global_steps, all_step_counters,
                         global_step_counter))
    p.start()
    processes.append(p)
    time.sleep(0.1)
    for rank in range(0, args['workers']):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, optimizer,
                             all_step_counters[rank], global_step_counter))
        p.start()
        processes.append(p)
        time.sleep(0.1)
    for p in processes:
        time.sleep(0.1)
        p.join()
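
main() builds SharedRMSprop / SharedAdam when args['shared_optimizer'] is set; those classes are not included here. Below is a minimal sketch of the idea behind SharedAdam.share_memory(), assuming the state tensors are created eagerly and then moved into shared memory. The real classes (including the one this code imports) typically also override step(), so treat this purely as an illustration of the shared-memory mechanism.

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    # Adam whose per-parameter state tensors live in shared memory, so every
    # worker process accumulates into the same moment estimates.

    def __init__(self, params, lr=1e-3, amsgrad=False):
        super(SharedAdam, self).__init__(params, lr=lr, amsgrad=amsgrad)
        # Materialize the state up front (it is normally created lazily on the
        # first step) so it exists before worker processes are spawned.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                if amsgrad:
                    state['max_exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # Move every state tensor into shared memory.
        for group in self.param_groups:
            for p in group['params']:
                for buf in self.state[p].values():
                    if torch.is_tensor(buf):
                        buf.share_memory_()
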
def test(args, shared_model):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        r'{0}{1}{2}_log'.format(args.log_dir, args.save_prefix, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = create_env(args.env, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, None, env, args, None)
    player.gpu_id = gpu_id

    AC = importlib.import_module(args.model_name)
    player.model = AC.ActorCritic(env.observation_space, env.action_space,
                                  args.stack_frames, args)

    player.state, player.info = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    episode_count = 0
    all_scores = []
    max_score = 0
    while True:
        if player.done:
            episode_count += 1
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())

        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            # Plot scores every 5 episodes
            all_scores.append(reward_sum)
            if (episode_count % 5 == 0):
                plt.clf()
                plt.plot(range(len(all_scores)), all_scores)
                plt.title('Test Episode Returns')
                plt.xlabel('Test Episode')
                plt.ylabel('Return')
                plt.savefig('{0}{1}{2}.png'.format(args.log_dir,
                                                   args.save_prefix, args.env))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}{2}.dat'.format(args.save_model_dir,
                                                   args.save_prefix, args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}{2}.dat'.format(args.save_model_dir,
                                               args.save_prefix, args.env))

            reward_sum = 0
            player.eps_len = 0
            state, player.info = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()