Example #1
def get_modules_for_notebook(path, params):
    EPOCH = 'best'
    POLICY_FILE = path + 'policy_checkpoints/policy_{}.pkl'.format(EPOCH)
    policy_language_model, reward_language_model = config.get_language_models(
        params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])
    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(
            path +
            'reward_checkpoints/reward_func_{}_checkpoint'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)
    goal_sampler.update_discovered_goals(params['all_descriptions'],
                                         episode_count=0,
                                         epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    policy.load_params(POLICY_FILE)
    return policy_language_model, reward_language_model, policy, reward_function, goal_sampler
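A minimal usage sketch for this helper, assuming a finished trial folder and a params dictionary rebuilt the same way as in Example #3 (via config.configure_everything); the path and the instruction string are illustrative and not taken from the listing above.

trial_path = '/path/to/trial_0/'
policy_language_model, reward_language_model, policy, reward_function, goal_sampler = \
    get_modules_for_notebook(trial_path, params)

# Encode an instruction with the restored language model, as the evaluation
# loops in the other examples do before generating rollouts.
goal_encoding = policy_language_model.encode('Grow any plant')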
Example #2
def launch(**kwargs):
    # Fork for multi-CPU MPI implementation.
    rank = fork(kwargs['num_cpu'])

    # Configure everything and log parameters
    params, rank_seed = config.configure_everything(rank, **kwargs)

    # Define language model
    policy_language_model, reward_language_model = config.get_language_models(
        params)

    # Define the one-hot encoder (vocabulary of words and max_seq_length)
    onehot_encoder = config.get_one_hot_encoder(params['train_descriptions'] +
                                                params['test_descriptions'])

    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    # Define reward function
    reward_function = config.get_reward_function(goal_sampler=goal_sampler,
                                                 params=params)
    oracle_reward_function = config.get_oracle_reward_function(
        goal_sampler, params)

    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)

    # Define the goal sampler for evaluation
    eval_goal_sampler = EvalGoalSampler(
        policy_language_model=policy_language_model,
        one_hot_encoder=onehot_encoder,
        params=params)
    # Give reward function to goal sampler to track metrics
    goal_sampler.store_reward_function(reward_function)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    # Define the social partner
    social_partner = SocialPartner(
        oracle_reward_function=oracle_reward_function,
        goal_sampler=goal_sampler,
        **params['social_partner_params'],
        params=params)

    # Define the data processor
    data_processor = DataProcessor(
        reward_function=reward_function,
        oracle_reward_function=oracle_reward_function,
        goal_sampler=goal_sampler,
        params=params)

    # Define the worker to interact with the environment (training and evaluation)
    training_worker = RolloutWorker(make_env=params['make_env'],
                                    policy=policy,
                                    reward_function=reward_function,
                                    params=params,
                                    **params['training_rollout_params'])
    training_worker.seed(rank_seed)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      **params['evaluation_rollout_params'],
                                      render=False)
    evaluation_worker.seed(rank_seed * 10)

    stats_logger = StatsLogger(goal_sampler=goal_sampler,
                               data_processor=data_processor,
                               training_worker=training_worker,
                               evaluation_worker=evaluation_worker,
                               reward_function=reward_function,
                               policy=policy,
                               params=params)

    train(logdir=params['experiment_params']['logdir'],
          policy=policy,
          training_worker=training_worker,
          goal_sampler=goal_sampler,
          eval_goal_sampler=eval_goal_sampler,
          evaluation_worker=evaluation_worker,
          social_partner=social_partner,
          n_epochs=params['experiment_params']['n_epochs'],
          n_test_rollouts=params['experiment_params']['n_test_rollouts'],
          n_cycles=params['experiment_params']['n_cycles'],
          n_batches=params['experiment_params']['n_batches'],
          reward_function=reward_function,
          stats_logger=stats_logger,
          data_processor=data_processor,
          params=params)
Example #3
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    if not render:
        env = 'PlaygroundNavigation-v1'
    else:
        env = 'PlaygroundNavigationRender-v1'
    params, rank_seed = config.configure_everything(
        rank=0,
        seed=seed,
        num_cpu=params['experiment_params']['n_cpus'],
        env=env,
        trial_id=0,
        n_epochs=10,
        reward_function=params['conditions']['reward_function'],
        policy_encoding=params['conditions']['policy_encoding'],
        feedback_strategy=params['conditions']['feedback_strategy'],
        policy_architecture=params['conditions']['policy_architecture'],
        goal_invention=params['conditions']['goal_invention'],
        reward_checkpoint=params['conditions']['reward_checkpoint'],
        rl_positive_ratio=params['conditions']['rl_positive_ratio'],
        p_partner_availability=params['conditions']
        ['p_social_partner_availability'],
        imagination_method=params['conditions']['imagination_method'],
        git_commit='')

    policy_language_model, reward_language_model = config.get_language_models(
        params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])
    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(
            PATH +
            'reward_checkpoints/reward_func_checkpoint_{}'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)
    goal_sampler.update_discovered_goals(params['all_descriptions'],
                                         episode_count=0,
                                         epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    policy.load_params(policy_file)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      render=render,
                                      **params['evaluation_rollout_params'])
    evaluation_worker.seed(seed)

    # Run evaluation.
    evaluation_worker.clear_history()

    env_params = evaluation_worker.env.unwrapped.params
    train_descriptions, test_descriptions, _ = generate_all_descriptions(
        env_params)
    train_descriptions = list(train_descriptions)
    test_descriptions = list(test_descriptions)
    # Shuffle the description lists in place (shuffling a temporary copy has no effect)
    np.random.shuffle(test_descriptions)
    np.random.shuffle(train_descriptions)
    successes_test_descr = []
    for d in test_descriptions:
        successes_test_descr.append([])
        print(d)
        for i in range(n_test_rollouts):
            goal_str = [d]
            goal_encoding = [policy_language_model.encode(goal_str[0])]
            goal_id = [0]
            ep = evaluation_worker.generate_rollouts(
                exploit=True,
                imagined=False,
                goals_str=goal_str,
                goals_encodings=goal_encoding,
                goals_ids=goal_id)
            out = get_reward_from_state(ep[0]['obs'][-1], goal_str[0],
                                        env_params)
            successes_test_descr[-1].append(out == 1)
        print('Success rate {}: {}'.format(d,
                                           np.mean(successes_test_descr[-1])))
    print('Global success rate: {}'.format(np.mean(successes_test_descr)))
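The listing does not show how main is invoked; a hypothetical argparse entry point could look like the sketch below. The flag names and defaults are assumptions, and the module-level constants PARAMS_FILE, PATH and EPOCH used inside main are expected to be defined elsewhere in the original script.

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--policy_file', type=str, required=True)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_test_rollouts', type=int, default=10)
    parser.add_argument('--render', action='store_true')
    args = parser.parse_args()

    main(policy_file=args.policy_file,
         seed=args.seed,
         n_test_rollouts=args.n_test_rollouts,
         render=args.render)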
Example #4
def plot_generalization(path, freq=10):
    first = True
    for trial in os.listdir(path):
        print(trial)
        # if os.path.exists(path + '/' + trial + '/adaptation_success_rates_food.txt'):

        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)

        goal_invention = int(
            params['conditions']['goal_invention'].split('_')[-1])
        env_id = params['conditions']['env_id']
        if 'plant' not in env_id:
            test_plants = plants.copy() + ['plant', 'living_thing']
            test_plants.remove('flower')
            test_descriptions = [
                'Grow {} {}'.format(c, p) for c in thing_colors + ['any']
                for p in test_plants
            ]
        else:
            if 'big' in env_id:
                test_plants = [
                    'algae', 'bonsai', 'tree', 'bush', 'plant', 'living_thing'
                ]
            else:
                test_plants = ['tree', 'bush', 'plant', 'living_thing']
            test_descriptions = [
                'Grow {} {}'.format(c, p) for c in thing_colors + ['any']
                for p in test_plants
            ]

        first_epoch = True

        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(
                rank=rank,
                seed=seed,
                num_cpu=params['experiment_params']['n_cpus'],
                env=env,
                trial_id=0,
                n_epochs=10,
                reward_function=params['conditions']['reward_function'],
                curriculum_replay_target=params['conditions']
                ['curriculum_replay_target'],
                curriculum_target=params['conditions']['curriculum_target'],
                policy_encoding=params['conditions']['policy_encoding'],
                bias_buffer=params['conditions']['bias_buffer'],
                feedback_strategy=params['conditions']['feedback_strategy'],
                goal_sampling_policy=params['conditions']
                ['goal_sampling_policy'],
                policy_architecture=params['conditions']
                ['policy_architecture'],
                goal_invention=params['conditions']['goal_invention'],
                reward_checkpoint=params['conditions']['reward_checkpoint'],
                rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                p_partner_availability=params['conditions']
                ['p_social_partner_availability'],
                power_rarity=2,
                git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(
                params)

            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(
                policy_language_model=policy_language_model,
                reward_language_model=reward_language_model,
                goal_dim=policy_language_model.goal_dim,
                one_hot_encoder=onehot_encoder,
                **params['goal_sampler'],
                params=params)

            reward_function = config.get_reward_function(goal_sampler, params)

        else:

            def make_env():
                return gym.make(params['conditions']['env_name'])

            params['make_env'] = make_env

        # Load policy.
        success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        for ind_ep, epoch in enumerate(epochs):
            print('\n\n\t\t EPOCH', epoch)
            if first:
                first = False
                reuse = False
            else:
                reuse = True

            if params['conditions']['reward_function'] == 'learned_lstm':
                reward_function.restore_from_checkpoint(
                    trial_folder +
                    'reward_checkpoints/reward_func_checkpoint_{}'.format(
                        epoch))
            policy_language_model.set_reward_function(reward_function)
            if reward_language_model is not None:
                reward_language_model.set_reward_function(reward_function)

            goal_sampler.update_discovered_goals(params['all_descriptions'],
                                                 episode_count=0,
                                                 epoch=0)

            with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                with open(policy_folder + 'policy_{}.pkl'.format(epoch),
                          'rb') as f:
                    policy = pickle.load(f)

            evaluation_worker = RolloutWorker(
                make_env=params['make_env'],
                policy=policy,
                reward_function=reward_function,
                params=params,
                render=RENDER,
                **params['evaluation_rollout_params'])
            evaluation_worker.seed(seed)

            # Run evaluation.
            evaluation_worker.clear_history()
            successes_per_descr = np.zeros([len(test_descriptions), 2])
            for ind_inst, instruction in enumerate(test_descriptions):
                # instruction = 'Grasp any fly'
                success_instruction = []
                goal_str = [instruction]
                goal_encoding = [policy_language_model.encode(goal_str[0])]
                goal_id = [0]

                for i in range(N_REPET):
                    ep = evaluation_worker.generate_rollouts(
                        exploit=True,
                        imagined=False,
                        goals_str=goal_str,
                        goals_encodings=goal_encoding,
                        goals_ids=goal_id)
                    for t in range(ep[0]['obs'].shape[0]):
                        metric_food = food_on_furniture(
                            ep[0]['obs'][t], goal_str[0])
                        if metric_food:
                            # print('\n\n Touched food')
                            break
                    for t in range(ep[0]['obs'].shape[0]):
                        metric_water = water_on_furniture(
                            ep[0]['obs'][t], goal_str[0])
                        if metric_water:
                            # print('\n \n Touched water')
                            break
                    success_instruction.append([metric_food, metric_water])
                success_instruction = np.array(success_instruction)
                success_rate_inst = np.mean(success_instruction, axis=0)
                successes_per_descr[ind_inst] = success_rate_inst
                print('\t Success rate {}: food {}, water {}'.format(
                    goal_str[0], success_rate_inst[0], success_rate_inst[1]))
                success_rates[ind_inst, ind_ep, :] = success_rate_inst
            np.savetxt(trial_folder + 'adaptation_success_rates_water.txt',
                       success_rates[:, :, 1])
            np.savetxt(trial_folder + 'adaptation_success_rates_food.txt',
                       success_rates[:, :, 0])

        # success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        # success_rates[:, :, 0] = np.loadtxt(trial_folder + 'adaptation_success_rates_food.txt')
        # success_rates[:, :, 1] = np.loadtxt(trial_folder + 'adaptation_success_rates_water.txt')

        line, err_min, err_max = get_stat_func(LINE, ERR)
        # plot
        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        for i in range(2):
            plt.plot(np.array(episodes) / 1000,
                     line(success_rates)[:, i],
                     linewidth=10,
                     color=colors[i])
            plt.fill_between(np.array(episodes) / 1000,
                             err_min(success_rates)[:, i],
                             err_max(success_rates)[:, i],
                             color=colors[i],
                             alpha=0.2)
        # plt.vlines(goal_invention * 0.6, ymin=0, ymax=1, linestyles='--', color='k', linewidth=5)
        leg = plt.legend(['food', 'water'], frameon=False)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'adaptation_success_rates.pdf'),
                    bbox_extra_artists=(lab, lab2, leg),
                    bbox_inches='tight',
                    dpi=50)  # add leg
Example #5
def run_generalization_study(path, freq=10):
    first = True

    for t_id, trial in enumerate(os.listdir(path)):
        print(trial)
        t_init = time.time()
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)

        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        test_descriptions = params['test_descriptions']

        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(rank=rank,
                                                            seed=seed,
                                                            num_cpu=params['experiment_params']['n_cpus'],
                                                            env=env,
                                                            trial_id=0,
                                                            n_epochs=10,
                                                            reward_function=params['conditions']['reward_function'],
                                                            policy_encoding=params['conditions']['policy_encoding'],
                                                            bias_buffer=params['conditions']['bias_buffer'],
                                                            feedback_strategy=params['conditions']['feedback_strategy'],
                                                            policy_architecture=params['conditions']['policy_architecture'],
                                                            goal_invention=params['conditions']['goal_invention'],
                                                            reward_checkpoint=params['conditions']['reward_checkpoint'],
                                                            rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                                                            p_partner_availability=params['conditions']['p_social_partner_availability'],
                                                            git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(params)
            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                                       reward_language_model=reward_language_model,
                                       goal_dim=policy_language_model.goal_dim,
                                       one_hot_encoder=onehot_encoder,
                                       **params['goal_sampler'],
                                       params=params)


            reward_function = config.get_reward_function(goal_sampler, params)
        else:
            def make_env():
                return gym.make(params['conditions']['env_name'])

            params['make_env'] = make_env
        loaded = False
        success_rates = np.zeros([len(test_descriptions), len(epochs)])
        if params['conditions']['reward_function'] == 'pretrained':
            reward_function.load_params(trial_folder + 'params_reward')
        if not loaded:
            # Load policy.
            t_init = time.time()

            for ind_ep, epoch in enumerate(epochs):
                print(time.time() - t_init)
                t_init = time.time()

                print('\n\n\t\t EPOCH', epoch)
                if first:
                    first = False
                    reuse = False
                else:
                    reuse = True

                if params['conditions']['reward_function'] == 'learned_lstm':
                    reward_function.restore_from_checkpoint(trial_folder + 'reward_checkpoints/reward_func_checkpoint_{}'.format(epoch))

                policy_language_model.set_reward_function(reward_function)
                if reward_language_model is not None:
                    reward_language_model.set_reward_function(reward_function)

                goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

                with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                    with open(policy_folder + 'policy_{}.pkl'.format(epoch), 'rb') as f:
                        policy = pickle.load(f)

                evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                                  policy=policy,
                                                  reward_function=reward_function,
                                                  params=params,
                                                  render=RENDER,
                                                  **params['evaluation_rollout_params'])
                evaluation_worker.seed(seed)

                # Run evaluation.
                evaluation_worker.clear_history()
                successes_per_descr = np.zeros([len(test_descriptions)])
                for ind_inst, instruction in enumerate(test_descriptions):
                    # instruction = 'Grasp any fly'
                    success_instruction = []
                    goal_str = [instruction]
                    goal_encoding = [policy_language_model.encode(goal_str[0])]
                    goal_id = [0]
                    for i in range(N_REPET):
                        ep = evaluation_worker.generate_rollouts(exploit=True,
                                                                 imagined=False,
                                                                 goals_str=goal_str,
                                                                 goals_encodings=goal_encoding,
                                                                 goals_ids=goal_id)
                        success = get_reward_from_state(state=ep[0]['obs'][-1], goal=instruction)
                        success_instruction.append(success)
                    success_rate_inst = np.mean(success_instruction)
                    successes_per_descr[ind_inst] = success_rate_inst
                    print('\t Success rate {}: {}'.format(goal_str[0], success_rate_inst))
                    success_rates[ind_inst, ind_ep] = success_rate_inst
                np.savetxt(trial_folder + 'generalization_success_rates.txt', success_rates)
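A short post-processing sketch for the file written above, assuming numpy is imported as np as in the examples; the trial path is illustrative.

# Reload the per-description success-rate matrix saved by the loop above and
# report the mean over test descriptions for each evaluated epoch.
rates = np.loadtxt('/path/to/trial_0/generalization_success_rates.txt')
print('Mean success rate per evaluated epoch:', rates.mean(axis=0))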