Example #1
def analyze_agent_kl(exp_key):
    """
    Evaluates the agent KL post-hoc for a given experiment.

    Args:
        exp_key (str): the experiment ID
    """
    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # get the list of checkpoint timesteps
    ckpt_asset_list = [a for a in asset_list if 'ckpt' in a['fileName']]
    ckpt_asset_names = [a['fileName'] for a in ckpt_asset_list]
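    # checkpoint filenames have the form 'ckpt_step_<timestep>.ckpt'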
    ckpt_timesteps = [int(s.split('ckpt_step_')[1].split('.ckpt')[0]) for s in ckpt_asset_names]
    ckpt_timesteps = list(np.sort(ckpt_timesteps)[::CKPT_SUBSAMPLE])

    agent_kls = []

    # initial episode using random init
    prev_episode, _, _ = collect_episode(env, agent)

    for ckpt_ind, ckpt_timestep in enumerate(ckpt_timesteps):
        # load the checkpoint
        print('Evaluating checkpoint ' + str(ckpt_ind + 1) + ' of ' + str(len(ckpt_timesteps)))
        load_checkpoint(agent, exp_key, ckpt_timestep)

        # evaluate agent KL
        print(' Evaluating agent KL...')
        agent_kls.append(estimate_agent_kl(env, agent, prev_episode))
        print(' Done.')

        # collect an episode
        print(' Collecting episode...')
        prev_episode, _, _ = collect_episode(env, agent)
        print(' Done.')

    return {'steps': ckpt_timesteps, 'agent_kl': np.array(agent_kls)}
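A minimal usage sketch (assuming the module-level constants and helpers referenced above, e.g. LOADING_API_KEY, create_env, collect_episode, are available and a Comet workspace is configured; the experiment key is a placeholder):

import matplotlib.pyplot as plt

result = analyze_agent_kl('<exp_key>')  # placeholder key
plt.plot(result['steps'], result['agent_kl'])
plt.xlabel('checkpoint timestep')
plt.ylabel('estimated agent KL')
plt.savefig('agent_kl.png')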
Example #2
                    type=lambda x: bool(strtobool(x)),
                    help='whether or not to log/plot with comet')
# other arguments here
args = parser.parse_args()

if args.seed is not None:
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.device_id is not None and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

# create the environment
env = create_env(args.env, args.seed)

# create the agent
agent, agent_args = create_agent(env, args.device_id)

# create the data buffer
buffer = Buffer(batch_size=args.batch_size, seq_len=args.train_seq_len)

# create the optimizer
optimizer = Optimizer(agent,
                      optimizer=args.optimizer,
                      lr=args.lr,
                      norm_grad=args.grad_norm,
                      weight_decay=args.weight_decay,
                      value_tau=args.value_tau,
                      policy_tau=args.policy_tau,
                      value_update=args.value_update,
                      policy_update=args.policy_update)
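The snippet above begins partway through an add_argument call. For reference, a self-contained sketch of the same strtobool boolean-flag pattern (the flag names here are hypothetical, not taken from the original script):

import argparse
from distutils.util import strtobool

parser = argparse.ArgumentParser()
# boolean flag parsed from strings such as 'True'/'False'
parser.add_argument('--plotting', default=True,
                    type=lambda x: bool(strtobool(x)),
                    help='whether or not to log/plot with comet')
parser.add_argument('--seed', default=None, type=int, help='random seed')

args = parser.parse_args(['--plotting', 'False'])
assert args.plotting is False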
Example #3
def compare_policies(exp_key1, exp_key2, write_result=True):
    """
    Compares the policies of two agents at the end of training.

    Args:
        exp_key1 (str):
        exp_key2 (str):
        write_result (bool)
    """
    # load the experiments
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    exp1 = comet_api.get_experiment(project_name=PROJECT_NAME,
                                    workspace=WORKSPACE,
                                    experiment=exp_key1)
    exp2 = comet_api.get_experiment(project_name=PROJECT_NAME,
                                    workspace=WORKSPACE,
                                    experiment=exp_key2)

    # create the environments (env name taken from exp1; assumes both experiments use the same env)
    param_summary = exp1.get_parameters_summary()
    env_name = [a for a in param_summary if a['name'] == 'env'][0]['valueCurrent']
    env1 = create_env(env_name)
    env2 = create_env(env_name)

    # create the agents
    asset_list = exp1.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = exp1.get_asset(agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
    agent1 = create_agent(env1, agent_args=agent_args)[0]
    load_checkpoint(agent1, exp_key1)

    asset_list = exp2.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = exp2.get_asset(agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
    agent2 = create_agent(env2, agent_args=agent_args)[0]
    load_checkpoint(agent2, exp_key2)

    # evaluate the KL between policies
    kl12 = []
    kl21 = []
    agent1.reset(); agent1.eval()
    agent2.reset(); agent2.eval()

    state1 = env1.reset()
    state2 = env2.reset()

    for state_ind in range(N_STATES):
        # perform policy optimization on state1
        action1 = agent1.act(state1)
        agent2.act(state1)
        kl = kl_divergence(agent1.approx_post, agent2.approx_post).sum().detach().item()
        kl12.append(kl)

        agent1.reset(); agent1.eval()
        agent2.reset(); agent2.eval()

        # perform policy optimization on state2
        agent1.act(state2)
        action2 = agent2.act(state2)
        kl = kl_divergence(agent2.approx_post, agent1.approx_post).sum().detach().item()
        kl21.append(kl)

        # step the environments
        state1, _, done1, _ = env1.step(action1)
        state2, _, done2, _ = env2.step(action2)

        if done1:
            agent1.reset(); agent1.eval()
            state1 = env1.reset()
            done1 = False
        if done2:
            agent2.reset(); agent2.eval()
            state2 = env2.reset()
            done2 = False

    kls = {'kl12': kl12,
           'kl21': kl21}

    if write_result:
        pickle.dump(kls, open('policy_kl_' + exp_key1 + '_' + exp_key2 + '.p', 'wb'))

    return kls
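Usage sketch (placeholder experiment keys; assumes Comet access and the module context of the examples above, including numpy imported as np):

kls = compare_policies('<exp_key_1>', '<exp_key_2>', write_result=False)
print('mean KL(agent 1 || agent 2):', np.mean(kls['kl12']))
print('mean KL(agent 2 || agent 1):', np.mean(kls['kl21']))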
Example #4
def compare_goal_optimizers(model_exp_key,
                            opt_exp_key=None,
                            write_results=True,
                            stochastic_model=False):
    """
    Optimize random goal states using a model-based estimator.
    Train the policy optimizer online. Compare with other optimizers.
    Note: tailored to HalfCheetah-v2 environment currently.

    Args:
        model_exp_key (str): model-based experiment key
        opt_exp_key (str): optimizer experiment key. If None, trains from scratch
        write_results (bool): whether to pickle results directly
        stochastic_model (bool): whether to sample states or use mean estimate
    """

    ## MODEL
    # load the model experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=model_exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary
                if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)
    # create a synchronous env to parallelize training
    sync_env = SynchronousEnv(env, BATCH_SIZE)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [
        a for a in asset_list if 'agent_args' in a['fileName']
    ]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(
            agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args[
            'inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # also, load the most recent episode to sample goal states
    asset_times = [
        asset['createdAt'] for asset in asset_list
        if 'state' in asset['fileName']
    ]
    state_asset = [
        a for a in asset_list if a['createdAt'] == max(asset_times)
    ][0]
    episode_states = json.loads(experiment.get_asset(state_asset['assetId']))

    # load the checkpoint
    load_checkpoint(agent, model_exp_key)

    if stochastic_model:
        agent.q_value_estimator.state_variable.cond_likelihood.stochastic = True

    # swap out the value estimator for goal-based estimator
    gb_estimator = GoalBasedQEstimator()
    # copy over the dynamics model
    gb_estimator.state_likelihood_model = agent.q_value_estimator.state_likelihood_model
    gb_estimator.state_variable = agent.q_value_estimator.state_variable
    # set the estimator
    agent.q_value_estimator = gb_estimator
    agent.q_value_estimator.set_goal_std(GOAL_STD)
    # agent.alphas['pi'] = 0.

    total_results = {
        'grad_based': None,
        'cem': None,
        'it_am': None,
        'goal_cond': None
    }

    goals = []
    print('Sampling goals...')
    for step_ind in range(N_TOTAL_STEPS):
        new_goal_states = np.stack([
            episode_states[np.random.randint(0, 25)] for _ in range(BATCH_SIZE)
        ])
        # goal_state = episode_states[goal_ind]
        new_goal_states = torch.from_numpy(new_goal_states).float().view(
            BATCH_SIZE, -1)
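        # zero out the goal's velocity components (HalfCheetah-v2 obs dims 8+ are velocities)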
        new_goal_states[:, 8:] *= 0.
        if step_ind == 0:
            goal_state = new_goal_states
        else:
            # randomly change the goal state with some small probability
            flips = (torch.rand(BATCH_SIZE, 1) <
                     GOAL_FLIP_PROB).float().repeat(1,
                                                    new_goal_states.shape[-1])
            goal_state = (1 - flips) * goal_state + flips * new_goal_states
        goals.append(goal_state)

    print('Evaluating gradient-based agent...')
    agent.inference_optimizer = GradientBasedInference(lr=1e-3, n_inf_iters=50)
    grad_based_results = collect_goal_optimization(agent, sync_env, goals)
    total_results['grad_based'] = grad_based_results
    print('Done.')

    # print('Evaluating CEM agent...')
    # agent.inference_optimizer = CEMInference(lr=1e-3, n_top_samples=10, n_inf_iters=50)
    # agent.n_action_samples = 100
    # cem_results = collect_goal_optimization(agent, sync_env, goals)
    # total_results['cem'] = cem_results
    # print('Done.')

    print('Evaluating iterative amortized agent...')
    # create an iterative amortized optimizer
    inputs = ['params', 'grads', 'state']
    n_input = 24
    if 'state' in inputs:
        n_input += 17
    network_args = {
        'type': 'recurrent',
        'n_layers': 2,
        'inputs': inputs,
        'n_units': 512,
        'connectivity': 'highway',
        'n_input': n_input
    }
    agent.inference_optimizer = IterativeInferenceModel(
        network_args=network_args, n_inf_iters=10)
    for m in agent.approx_post.models:
        agent.approx_post.models[m] = FullyConnectedLayer(512, 6)
        agent.approx_post.gates[m] = FullyConnectedLayer(
            512, 6, non_linearity='sigmoid')
        agent.approx_post.update = 'iterative'
    # create a parameter optimizer for the inference model
    inference_parameters = list(agent.inference_optimizer.parameters()) + \
                           list(agent.approx_post.parameters())
    inf_optim = optim.Adam(inference_parameters, lr=3e-4)
    it_am_results = collect_goal_optimization(agent,
                                              sync_env,
                                              goals,
                                              inf_optim=inf_optim)
    total_results['it_am'] = it_am_results
    print('Done.')

    print('Evaluating goal-conditioned agent...')
    # create a direct, goal-conditioned network
    network_args = {
        'type': 'fully_connected',
        'n_layers': 2,
        'inputs': ['state', 'goal'],
        'n_units': 512,
        'connectivity': 'highway',
        'n_input': 17 + 17
    }
    agent.inference_optimizer = DirectGoalInferenceModel(
        network_args=network_args)
    for m in agent.approx_post.models:
        agent.approx_post.models[m] = FullyConnectedLayer(512, 6)
        agent.approx_post.update = 'direct'
    # create a parameter optimizer for the inference model
    inference_parameters = list(agent.inference_optimizer.parameters()) + \
                           list(agent.approx_post.parameters())
    inf_optim = optim.Adam(inference_parameters, lr=3e-4)
    goal_cond_results = collect_goal_optimization(agent,
                                                  sync_env,
                                                  goals,
                                                  inf_optim=inf_optim)
    total_results['goal_cond'] = goal_cond_results
    print('Done.')

    if write_results:
        pickle.dump(total_results,
                    open('comp_goal_opt_' + model_exp_key + '.p', 'wb'))

    return total_results
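The goal-sampling loop above keeps a batch of goal states and, after the first step, replaces each one with a freshly sampled goal with a small per-step probability. A self-contained sketch of that resampling step (the batch size, flip probability, and state dimension below are illustrative):

import torch

BATCH_SIZE = 4
GOAL_FLIP_PROB = 0.01
STATE_DIM = 17

goal_state = torch.zeros(BATCH_SIZE, STATE_DIM)        # current goals
new_goal_states = torch.randn(BATCH_SIZE, STATE_DIM)   # freshly sampled goals

# per-environment Bernoulli mask: 1 where the goal is replaced this step
flips = (torch.rand(BATCH_SIZE, 1) < GOAL_FLIP_PROB).float().repeat(1, STATE_DIM)
goal_state = (1 - flips) * goal_state + flips * new_goal_states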
Example #5
def goal_optimization(model_exp_key, opt_exp_key=None, write_results=True):
    """
    Optimize random goal states using a model-based estimator.
    Note: tailored to HalfCheetah-v2 environment currently.

    Args:
        model_exp_key (str): model-based experiment key
        opt_exp_key (str): optimizer experiment key. If None, a gradient-based
                           inference optimizer is used instead
        write_results (bool): whether to pickle results directly
    """

    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=model_exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary
                if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [
        a for a in asset_list if 'agent_args' in a['fileName']
    ]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(
            agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args[
            'inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # also, load the most recent episode to sample goal states
    asset_times = [
        asset['createdAt'] for asset in asset_list
        if 'state' in asset['fileName']
    ]
    state_asset = [
        a for a in asset_list if a['createdAt'] == max(asset_times)
    ][0]
    episode_states = json.loads(experiment.get_asset(state_asset['assetId']))

    # load the checkpoint
    load_checkpoint(agent, model_exp_key)

    # load the optimizer
    if opt_exp_key is not None:
        # load the experiment
        comet_api = comet_ml.API(api_key=LOADING_API_KEY)
        opt_experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                                  workspace=WORKSPACE,
                                                  experiment=opt_exp_key)

        # create the agent
        asset_list = opt_experiment.get_asset_list()
        agent_config_asset_list = [
            a for a in asset_list if 'agent_args' in a['fileName']
        ]
        agent_args = None
        if len(agent_config_asset_list) > 0:
            # if we've saved the agent config dict, load it
            agent_args = opt_experiment.get_asset(
                agent_config_asset_list[0]['assetId'])
            agent_args = json.loads(agent_args)
            agent_args = agent_args if 'opt_type' in agent_args[
                'inference_optimizer_args'] else None
        opt_agent = create_agent(env, agent_args=agent_args)[0]

        # load the checkpoint
        load_checkpoint(opt_agent, opt_exp_key)

        agent.inference_optimizer = opt_agent.inference_optimizer
        agent.inference_optimizer.n_inf_iters = 20
    else:
        # create a gradient-based optimizer
        agent.inference_optimizer = GradientBasedInference(lr=1e-3,
                                                           n_inf_iters=50)

    # swap out the value estimator for goal-based estimator
    gb_estimator = GoalBasedQEstimator()
    # copy over the dynamics model
    gb_estimator.state_likelihood_model = agent.q_value_estimator.state_likelihood_model
    gb_estimator.state_variable = agent.q_value_estimator.state_variable
    # set the estimator
    agent.q_value_estimator = gb_estimator
    agent.q_value_estimator.set_goal_std(GOAL_STD)
    # agent.alphas['pi'] = 0.

    # optimize goal states
    goal_states = []
    traj_states = []
    env_states = {'qpos': [], 'qvel': []}
    actions = []
    inf_objectives = []

    agent.reset()
    agent.eval()
    state = env.reset()
    if RENDER:
        env.render()
    goal_state = None

    # goal_ind = 0

    print('Collecting goal-optimization episode...')
    for step_ind in range(N_TOTAL_STEPS):
        print('STEP: ' + str(step_ind))
        if step_ind % GOAL_INTERVAL == 0:
            goal_state = episode_states[np.random.randint(0, 25)]
            # goal_state = episode_states[goal_ind]
            goal_state = torch.from_numpy(np.array(goal_state)).float().view(
                1, -1)
            goal_state[:, 8:] *= 0.
            if not TRAJECTORY_FOLLOW:
                agent.q_value_estimator.set_goal_state(goal_state)
            # goal_ind += 1
        if TRAJECTORY_FOLLOW:
            # define a sub-goal between current state and goal state
            delta_state = goal_state - state
            traj_state = state + 0.1 * delta_state
            agent.q_value_estimator.set_goal_state(traj_state)
            traj_states.append(traj_state)
        else:
            traj_states.append(goal_state)
        goal_states.append(goal_state)
        env_states['qpos'].append(copy.deepcopy(env.sim.data.qpos))
        env_states['qvel'].append(copy.deepcopy(env.sim.data.qvel))
        action = agent.act(state, eval=True)
        state, _, _, _ = env.step(action)
        inf_objectives.append(agent.inference_optimizer.estimated_objectives)
        # import ipdb; ipdb.set_trace()
        agent.inference_optimizer.reset(1)
        if RENDER:
            env.render()
        actions.append(action)
    print('Done.')

    # save the results
    results = {
        'goal_states': goal_states,
        'traj_states': traj_states,
        'env_states': env_states,
        'actions': actions
    }

    if write_results:
        pickle.dump(results, open('goal_opt_' + model_exp_key + '.p', 'wb'))

    return results
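Usage sketch (placeholder experiment key; requires MuJoCo, Comet access, and the module-level constants such as N_TOTAL_STEPS, GOAL_INTERVAL, and GOAL_STD). With opt_exp_key=None, a gradient-based inference optimizer is used:

results = goal_optimization('<model_exp_key>', opt_exp_key=None,
                            write_results=False)
print('collected', len(results['actions']), 'actions over',
      len(results['goal_states']), 'steps')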
Example #6
def goal_optimization_training(model_exp_key,
                               opt_exp_key=None,
                               write_results=True,
                               stochastic_model=False,
                               train_model=False):
    """
    Optimize random goal states using a model-based estimator.
    Train the policy optimizer online.
    Note: tailored to HalfCheetah-v2 environment currently.

    Args:
        model_exp_key (str): model-based experiment key
        opt_exp_key (str): optimizer experiment key. If None, trains from scratch
        write_results (bool): whether to pickle results directly
        stochastic_model (bool): whether to sample states or use mean estimate
        train_model (bool): whether to train the model online
    """

    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=model_exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary
                if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)
    # create a synchronous env to parallelize training
    sync_env = SynchronousEnv(env, BATCH_SIZE)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [
        a for a in asset_list if 'agent_args' in a['fileName']
    ]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(
            agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args[
            'inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # also, load the most recent episode to sample goal states
    asset_times = [
        asset['createdAt'] for asset in asset_list
        if 'state' in asset['fileName']
    ]
    state_asset = [
        a for a in asset_list if a['createdAt'] == max(asset_times)
    ][0]
    episode_states = json.loads(experiment.get_asset(state_asset['assetId']))

    # load the checkpoint
    load_checkpoint(agent, model_exp_key)

    if stochastic_model:
        agent.q_value_estimator.state_variable.cond_likelihood.stochastic = True

    # load the optimizer
    if opt_exp_key is not None:
        # load the experiment
        comet_api = comet_ml.API(api_key=LOADING_API_KEY)
        opt_experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                                  workspace=WORKSPACE,
                                                  experiment=opt_exp_key)

        # create the agent
        asset_list = opt_experiment.get_asset_list()
        agent_config_asset_list = [
            a for a in asset_list if 'agent_args' in a['fileName']
        ]
        agent_args = None
        if len(agent_config_asset_list) > 0:
            # if we've saved the agent config dict, load it
            agent_args = opt_experiment.get_asset(
                agent_config_asset_list[0]['assetId'])
            agent_args = json.loads(agent_args)
            agent_args = agent_args if 'opt_type' in agent_args[
                'inference_optimizer_args'] else None
        opt_agent = create_agent(env, agent_args=agent_args)[0]

        # load the checkpoint
        load_checkpoint(opt_agent, opt_exp_key)

        agent.inference_optimizer = opt_agent.inference_optimizer
        agent.inference_optimizer.n_inf_iters = 10
    else:
        # create an iterative amortized optimizer
        n_input = 12
        if ENCODING_TYPE == 'grads':
            inputs = ['params', 'grads']
            n_input += 12
        elif ENCODING_TYPE == 'errors':
            inputs = ['params', 'errors']
            n_input += (17 + 17 + 6)
        n_units = 512
        # network_args = {'type': 'fully_connected',
        #                 'n_layers': 2,
        #                 'inputs': inputs,
        #                 'n_units': n_units,
        #                 'connectivity': 'highway',
        #                 'batch_norm': False,
        #                 'non_linearity': 'elu',
        #                 'dropout': None,
        #                 'separate_networks': False,
        #                 'n_input': n_input}
        network_args = {
            'type': 'recurrent',
            'n_layers': 2,
            'inputs': inputs,
            'n_units': n_units,
            'connectivity': 'highway',
            'batch_norm': False,
            'dropout': None,
            'separate_networks': False,
            'n_input': n_input
        }
        agent.inference_optimizer = IterativeInferenceModel(
            network_args=network_args,
            n_inf_iters=5,
            encoding_type=ENCODING_TYPE)
        for m in agent.approx_post.models:
            agent.approx_post.models[m] = FullyConnectedLayer(n_units, 6)
            agent.approx_post.gates[m] = FullyConnectedLayer(
                n_units, 6, non_linearity='sigmoid')

    # create a parameter optimizer for the inference model
    inference_parameters = list(agent.inference_optimizer.parameters()) + \
                           list(agent.approx_post.parameters())
    param_opt = optim.Adam(inference_parameters, lr=3e-4)

    # swap out the value estimator for goal-based estimator
    gb_estimator = GoalBasedQEstimator()
    # copy over the dynamics model
    gb_estimator.state_likelihood_model = agent.q_value_estimator.state_likelihood_model
    gb_estimator.state_variable = agent.q_value_estimator.state_variable
    # set the estimator
    agent.q_value_estimator = gb_estimator
    agent.q_value_estimator.set_goal_std(GOAL_STD)
    # agent.alphas['pi'] = 0.

    model_param_opt = None
    if train_model:
        # create a parameter optimizer for the dynamics model
        model_parameters = \
            list(agent.q_value_estimator.state_likelihood_model.parameters()) + \
            list(agent.q_value_estimator.state_variable.parameters())
        model_param_opt = optim.Adam(model_parameters, lr=3e-4)

    # optimize goal states
    goal_states = []
    traj_states = []
    env_states = {'qpos': [], 'qvel': []}
    actions = []
    inf_objectives = []
    state_log_likelihoods = []
    state_squared_errors = []
    state_locs = []
    state_scales = []
    model_cll_training = []

    agent.reset(batch_size=BATCH_SIZE)
    agent.eval()
    state = sync_env.reset()
    if RENDER:
        env.render()
    goal_state = None
    state_likelihood = None

    # goal_ind = 0

    print('Collecting goal-optimization episode...')
    for step_ind in range(N_TOTAL_STEPS):
        print('STEP: ' + str(step_ind))
        # if step_ind % GOAL_INTERVAL == 0:
        if True:
            new_goal_states = np.stack([
                episode_states[np.random.randint(0, 25)]
                for _ in range(BATCH_SIZE)
            ])
            # goal_state = episode_states[goal_ind]
            new_goal_states = torch.from_numpy(new_goal_states).float().view(
                BATCH_SIZE, -1)
            new_goal_states[:, 8:] *= 0.
            if step_ind == 0:
                goal_state = new_goal_states
            else:
                # randomly change the goal state with some small probability
                flips = (torch.rand(BATCH_SIZE, 1) <
                         GOAL_FLIP_PROB).float().repeat(
                             1, new_goal_states.shape[-1])
                goal_state = (1 - flips) * goal_state + flips * new_goal_states
            if not TRAJECTORY_FOLLOW:
                agent.q_value_estimator.set_goal_state(goal_state)
            # goal_ind += 1
        if TRAJECTORY_FOLLOW:
            # define a sub-goal between current state and goal state
            delta_state = goal_state - state
            traj_state = state + 0.1 * delta_state
            agent.q_value_estimator.set_goal_state(traj_state)
            traj_states.append(traj_state)
        else:
            traj_states.append(goal_state)
        goal_states.append(goal_state)
        qpos = np.stack(
            [copy.deepcopy(e.sim.data.qpos) for e in sync_env.envs])
        qvel = np.stack(
            [copy.deepcopy(e.sim.data.qvel) for e in sync_env.envs])
        env_states['qpos'].append(qpos)
        env_states['qvel'].append(qvel)
        action = agent.act(state, eval=True)
        state, _, _, _ = sync_env.step(action)
        inf_objectives.append(agent.inference_optimizer.estimated_objectives)

        if train_model:
            agent.q_value_estimator.generate(agent)
            cll = -agent.q_value_estimator.state_variable.cond_log_likelihood(
                state).view(-1, 1).mean()
            model_cll_training.append(cll.detach().item())
            cll.backward()
            model_param_opt.step()
            # clear the model gradients so they do not accumulate across steps
            model_param_opt.zero_grad()

        if state_likelihood is not None:
            state_ll = state_likelihood.log_prob(state)
            state_log_likelihoods.append(state_ll)
            state_squared_error = (state_likelihood.loc - state).pow(2)
            state_squared_errors.append(state_squared_error)

        state_loc = agent.collector.distributions['state']['cond_like']['loc'][
            -1]
        state_scale = agent.collector.distributions['state']['cond_like'][
            'scale'][-1]
        state_locs.append(state_loc)
        state_scales.append(state_scale)
        state_likelihood = Normal(state_loc, state_scale)

        # update the inference optimizer
        grads = [param.grad for param in inference_parameters]
        divide_gradients_by_value(grads, agent.inference_optimizer.n_inf_iters)
        divide_gradients_by_value(grads, BATCH_SIZE)
        param_opt.step()
        param_opt.zero_grad()

        agent.inference_optimizer.reset(BATCH_SIZE)
        if RENDER:
            env.render()
        actions.append(action)
    print('Done.')

    # save the results
    results = {
        'goal_states': goal_states,
        'traj_states': traj_states,
        'env_states': env_states,
        'actions': actions,
        'inf_objectives': inf_objectives,
        'state_locs': state_locs,
        'state_scales': state_scales,
        'state_log_likelihoods': state_log_likelihoods,
        'state_squared_errors': state_squared_errors,
        'model_cll_training': model_cll_training
    }

    if write_results:
        pickle.dump(results, open('goal_opt_' + model_exp_key + '.p', 'wb'))

    return results
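The inference-model update above averages the gradients accumulated over the inference iterations and the batch before taking an Adam step. A self-contained sketch of that averaging without the repo's divide_gradients_by_value helper (the toy linear model and constants are illustrative):

import torch
import torch.optim as optim

BATCH_SIZE = 4
N_INF_ITERS = 5

model = torch.nn.Linear(17, 6)
opt = optim.Adam(model.parameters(), lr=3e-4)

# accumulate gradients over several inference iterations
for _ in range(N_INF_ITERS):
    loss = model(torch.randn(BATCH_SIZE, 17)).pow(2).sum()
    loss.backward()

# average the accumulated gradients, then update and clear them
for p in model.parameters():
    if p.grad is not None:
        p.grad.div_(N_INF_ITERS * BATCH_SIZE)
opt.step()
opt.zero_grad()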
Example #7
def evaluate_estimator(exp_key, n_state_action, n_mc_samples, device_id=None):
    """
    Evaluates the value estimator of a cached experiment throughout learning.

    Args:
        exp_key (str): the string of the Comet experiment key
        n_state_action (int): number of state-action pairs to evaluate
        n_mc_samples (int): number of Monte Carlo samples used to estimate
                            environment returns
        device_id (int, optional): device ID passed to create_agent

    Returns dictionary containing:
                ckpt_timesteps [n_ckpts]
                value_estimates [n_ckpts, n_state_action, 1],
                direct_value_estimates [n_ckpts, n_state_action, 1]
                mc_estimates [n_ckpts, n_state_action, n_mc_samples]
    """
    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=exp_key)

    # create the corresponding environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary
                if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)

    # collect state-action samples using random policy
    print('Collecting ' + str(n_state_action) + ' state-action pairs...')
    sa_pairs = {'states': [], 'env_states': [], 'actions': []}
    state = env.reset()
    env_state = (copy.deepcopy(env.sim.data.qpos),
                 copy.deepcopy(env.sim.data.qvel))
    for _ in range(n_state_action):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        sa_pairs['states'].append(state)
        sa_pairs['env_states'].append(env_state)
        sa_pairs['actions'].append(torch.from_numpy(action).view(1, -1))
        state = env.reset() if done else next_state
        env_state = (copy.deepcopy(env.sim.data.qpos),
                     copy.deepcopy(env.sim.data.qvel))
    print('Done.')

    # enumerate state-action pairs, estimating returns at each stage of learning
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [
        a for a in asset_list if 'agent_args' in a['fileName']
    ]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(
            agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args[
            'inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args, device_id=device_id)[0]
    # get the list of checkpoint timesteps
    ckpt_asset_list = [a for a in asset_list if 'ckpt' in a['fileName']]
    ckpt_asset_names = [a['fileName'] for a in ckpt_asset_list]
    ckpt_timesteps = [
        int(s.split('ckpt_step_')[1].split('.ckpt')[0])
        for s in ckpt_asset_names
    ]

    # convert n_mc_samples to a round number of batches
    n_batches = math.ceil(n_mc_samples / ROLLOUT_BATCH_SIZE)
    n_mc_samples = ROLLOUT_BATCH_SIZE * n_batches

    # sub-sample the checkpoint timesteps before allocating the result arrays,
    # so the first dimension matches the number of evaluated checkpoints
    ckpt_timesteps = list(np.sort(ckpt_timesteps)[::CKPT_SUBSAMPLE])
    value_estimates = np.zeros((len(ckpt_timesteps), n_state_action, 1))
    direct_value_estimates = np.zeros((len(ckpt_timesteps), n_state_action, 1))
    mc_estimates = np.zeros(
        (len(ckpt_timesteps), n_state_action, n_mc_samples))
    # iterate over the sub-sampled checkpoint timesteps, evaluating
    for ckpt_ind, ckpt_timestep in enumerate(ckpt_timesteps):
        # load the checkpoint
        print('Evaluating checkpoint ' + str(ckpt_ind + 1) + ' of ' +
              str(len(ckpt_timesteps)))
        load_checkpoint(agent, exp_key, ckpt_timestep)
        # get value estimate and estimate returns for the state-action pairs
        for sa_ind, (env_state, state, act) in enumerate(
                zip(sa_pairs['env_states'], sa_pairs['states'],
                    sa_pairs['actions'])):
            t_start = time.time()
            action_value_estimate = get_agent_value_estimate(agent, state, act)
            value_estimates[ckpt_ind,
                            sa_ind, :] = action_value_estimate['estimate']
            direct_value_estimates[ckpt_ind,
                                   sa_ind, :] = action_value_estimate['direct']
            returns = estimate_monte_carlo_return(env, agent, env_state, state,
                                                  act, n_batches)
            mc_estimates[ckpt_ind, sa_ind, :] = returns
            if sa_ind % 1 == 0:
                print('  Evaluated ' + str(sa_ind + 1) + ' of ' +
                      str(len(sa_pairs['states'])) + ' state-action pairs.')
                print('  Duration: ' + '{:.2f}'.format(time.time() - t_start) +
                      ' s / state-action pair.')

    # TODO: log the value estimates to comet (need to json-ify the numpy arrays)
    # prev_exp = comet_ml.ExistingExperiment(api_key=LOGGING_API_KEY,
    #                                        previous_experiment=exp_key)
    # prev_exp.log_asset_data(value_estimates, name='value_estimates')
    # prev_exp.log_asset_data(direct_value_estimates, name='direct_value_estimates')
    # prev_exp.log_asset_data(mc_estimates, name='mc_estimates')

    return {
        'ckpt_timesteps': ckpt_timesteps,
        'value_estimates': value_estimates,
        'direct_value_estimates': direct_value_estimates,
        'mc_estimates': mc_estimates
    }
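Usage sketch (placeholder experiment key; requires MuJoCo and Comet access). It compares the learned value estimates with the Monte Carlo return estimates per checkpoint:

out = evaluate_estimator('<exp_key>', n_state_action=10, n_mc_samples=50)
# gap between learned estimates and Monte Carlo returns, per checkpoint
gap = out['value_estimates'][..., 0] - out['mc_estimates'].mean(axis=-1)
print('mean estimation gap per checkpoint:', gap.mean(axis=1))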