def load_checkpoint(agent, checkpoint_exp_key, timestep=None):
    """
    Loads a checkpoint from Comet into the given agent.

    Args:
        agent (Agent): the agent to be loaded; its load_state_dict() is
            called with the downloaded state dict
        checkpoint_exp_key (str): key of the Comet experiment that holds the
            checkpoint assets
        timestep (int, optional): if given, the asset named
            'ckpt_step_<timestep>.ckpt' is loaded; otherwise the checkpoint
            asset with the largest 'createdAt' value is loaded

    Raises:
        KeyError: if no checkpoint asset exists for the requested timestep
        ValueError: if timestep is None and the experiment has no
            checkpoint assets
    """
    assert checkpoint_exp_key is not None, 'Checkpoint experiment key must be set.'
    print('Loading checkpoint from ' + checkpoint_exp_key + '...')
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=checkpoint_exp_key)
    asset_list = experiment.get_asset_list()
    if timestep is not None:
        # get the specified checkpoint
        file_name = 'ckpt_step_' + str(timestep) + '.ckpt'
        asset = next((a for a in asset_list if a['fileName'] == file_name), None)
        if asset is None:
            raise KeyError('Checkpoint timestep not found.')
    else:
        # get most recent checkpoint; search the checkpoint assets only, so
        # that another asset sharing the same timestamp can never be chosen
        ckpt_assets = [a for a in asset_list if 'ckpt' in a['fileName']]
        if not ckpt_assets:
            raise ValueError('No checkpoints found in experiment '
                             + checkpoint_exp_key + '.')
        asset = max(ckpt_assets, key=lambda a: a['createdAt'])
    print('Checkpoint Name:', asset['fileName'])
    ckpt = experiment.get_asset(asset['assetId'])
    # the asset is fetched as raw bytes; wrap it so torch can read it as a file
    state_dict = torch.load(io.BytesIO(ckpt))
    agent.load_state_dict(state_dict)
    print('Done.')
def analyze_agent_kl(exp_key):
    """
    Evaluates the agent KL post-hoc across the checkpoints of an experiment.

    For every sub-sampled checkpoint, the agent is loaded, the agent KL is
    estimated on the episode collected before loading, and a new episode is
    collected for the next checkpoint.

    Args:
        exp_key (str): the experiment ID

    Returns:
        dict with 'steps' (list of evaluated checkpoint timesteps) and
        'agent_kl' (numpy array with one estimate per checkpoint)
    """
    api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = api.get_experiment(project_name=PROJECT_NAME,
                                    workspace=WORKSPACE,
                                    experiment=exp_key)

    # environment: its name is stored as the 'env' experiment parameter
    env_entries = [p for p in experiment.get_parameters_summary()
                   if p['name'] == 'env']
    env = create_env(env_entries[0]['valueCurrent'])

    # agent: use the saved config only when it carries an 'opt_type' entry
    assets = experiment.get_asset_list()
    config_assets = [a for a in assets if 'agent_args' in a['fileName']]
    agent_args = None
    if config_assets:
        loaded_args = json.loads(experiment.get_asset(config_assets[0]['assetId']))
        if 'opt_type' in loaded_args['inference_optimizer_args']:
            agent_args = loaded_args
    agent = create_agent(env, agent_args=agent_args)[0]

    # checkpoint timesteps, parsed from 'ckpt_step_<n>.ckpt' file names
    steps = []
    for a in assets:
        name = a['fileName']
        if 'ckpt' in name:
            steps.append(int(name.split('ckpt_step_')[1].split('.ckpt')[0]))
    ckpt_timesteps = list(np.sort(steps)[::CKPT_SUBSAMPLE])
    n_ckpts = len(ckpt_timesteps)

    agent_kls = []
    # initial episode using random init
    prev_episode, _, _ = collect_episode(env, agent)
    for ckpt_num, ckpt_timestep in enumerate(ckpt_timesteps, start=1):
        print('Evaluating checkpoint ' + str(ckpt_num) + ' of ' + str(n_ckpts))
        load_checkpoint(agent, exp_key, ckpt_timestep)

        print(' Evaluating agent KL...')
        agent_kls.append(estimate_agent_kl(env, agent, prev_episode))
        print(' Done.')

        print(' Collecting episode...')
        prev_episode, _, _ = collect_episode(env, agent)
        print(' Done.')

    return {'steps': ckpt_timesteps, 'agent_kl': np.array(agent_kls)}
def clear_comet_ml_project():
    """Delete every experiment of the example project in the default workspace."""
    import comet_ml

    api = comet_ml.API()
    # The project name is hardcoded in the
    # `ray/ml/examples/upload_to_comet_ml.py` example.
    experiment_keys = [
        exp.key
        for exp in api.get_experiments(
            workspace=api.get_default_workspace(),
            project_name="ray-air-example",
        )
    ]
    api.delete_experiments(experiment_keys)
def get_experiment(run_id):
    """
    Return the Comet experiment associated with ``run_id``.

    The experiment key is the SHA-1 hex digest of ``run_id`` and is exported
    through the ``COMET_EXPERIMENT_KEY`` environment variable. The key is not
    passed to the experiment constructors explicitly, so presumably they read
    it from that variable.

    A new ``comet_ml.Experiment`` is created when the API lookup returns
    ``None``; otherwise a ``comet_ml.ExistingExperiment`` is returned.
    """
    experiment_id = hashlib.sha1(run_id.encode("utf-8")).hexdigest()
    os.environ["COMET_EXPERIMENT_KEY"] = experiment_id

    # Assumes API key is set in config/env
    known_experiment = comet_ml.API().get_experiment_by_id(experiment_id)

    if known_experiment is not None:
        return comet_ml.ExistingExperiment(project_name=PROJECT_NAME)
    return comet_ml.Experiment(project_name=PROJECT_NAME)
def init_logger(self, cfg):
    """
    Create the Comet logger, resuming an earlier experiment when possible.

    The run is resumed when ``cfg.resume`` is truthy or when
    ``cfg.experiment_key`` names an experiment that the Comet API can find.
    Otherwise a fresh experiment is created, tagged with ``cfg.tags`` and
    given the parameters in ``self.cfg``.

    Args:
        cfg: configuration object; reads ``experiment_key``, ``resume``,
            ``disabled``, ``project`` and ``tags``

    Returns:
        a ``comet_ml.ExistingExperiment`` or ``comet_ml.Experiment``
    """
    experiment_key = cfg.experiment_key
    resume = cfg.resume

    if experiment_key is not None:
        # A key was supplied, but the experiment might not exist yet.
        api = comet_ml.API()  # Assumes API key is set in config/env
        try:
            found = api.get_experiment_by_id(experiment_key)
        except Exception:
            found = None
        if found is not None:
            resume = True

    histogram_flags = {
        'auto_histogram_weight_logging': True,
        'auto_histogram_gradient_logging': True,
        'auto_histogram_activation_logging': True,
    }

    if not resume:
        # Fresh run. Without an explicit key the experiment uses
        # COMET_EXPERIMENT_KEY if defined in the env, else a random key.
        logger = comet_ml.Experiment(
            disabled=cfg.disabled,
            project_name=cfg.project,
            **histogram_flags)
        logger.add_tags(cfg.tags.split())
        # NOTE(review): parameters come from self.cfg, not the cfg argument.
        logger.log_parameters(self.cfg)
        return logger

    # Carry on with the existing experiment; env/GPU/CPU logging continues.
    return comet_ml.ExistingExperiment(
        previous_experiment=experiment_key,
        log_env_details=True,
        log_env_gpu=True,
        log_env_cpu=True,
        **histogram_flags)
def _download_model_weights_from_comet(
    experiment_key: str = "latest",
    download_path: str = "model/weights/srgan_generator_model_weights.npz",
) -> "tuple[int, float]":
    """
    Download DeepBedMap's Generator neural network model weights from Comet.ML

    By default, the model weights from the latest experimental run are
    downloaded. Passing in an alternative experiment_key hash will download
    that one instead. Also returns the model's num_residual_blocks and
    residual_scaling hyperparameters.

    Uses Comet.ML's Python REST API class at
    https://www.comet.ml/docs/python-sdk/API/
    Requires the COMET_REST_API_KEY environment variable to be set in the
    .env file; its value is base64-decoded before use.

    Args:
        experiment_key: Comet experiment key, or "latest" to pick the
            experiment with the largest start_server_timestamp.
        download_path: where the .npz weight file is written; missing parent
            directories are created.

    Returns:
        (num_residual_blocks, residual_scaling) as (int, float).

    Raises:
        KeyError: if COMET_REST_API_KEY is not set.
        FileNotFoundError: if the experiment has no asset ending in ".npz".
    """
    comet_api = comet_ml.API(
        rest_api_key=base64.b64decode(s=os.environ["COMET_REST_API_KEY"])
    )

    # Get pointer to a DeepBedMap experiment on Comet ML
    if experiment_key == "latest":
        # Get list of DeepBedMap experiments
        project = comet_api.get(workspace="weiji14", project="deepbedmap")
        # NOTE(review): pd.io.json.json_normalize is the older spelling of
        # pd.json_normalize; kept as-is for the pandas version in use.
        df = pd.io.json.json_normalize(data=project.data["experiments"].values())
        # Get the key to the latest DeepBedMap experiment on Comet ML
        experiment_key = df.loc[df["start_server_timestamp"].idxmax()].experiment_key
    experiment = comet_api.get(
        workspace="weiji14", project="deepbedmap", experiment=experiment_key
    )

    # Find the experiment's first asset that is an .npz weight file
    asset_id = next(
        (
            asset["assetId"]
            for asset in experiment.asset_list
            if asset["fileName"].endswith(".npz")
        ),
        None,
    )
    if asset_id is None:
        raise FileNotFoundError(
            f"No .npz asset found in Comet experiment {experiment_key}"
        )

    # Download the neural network weight file (npz format) to the right place!
    download_dir = os.path.dirname(download_path)
    if download_dir:  # a bare file name has no directory to create
        os.makedirs(name=download_dir, exist_ok=True)
    with open(download_path, mode="wb") as model_weight_file:
        model_weight_file.write(experiment.get_asset(asset_id=asset_id))

    # Get hyperparameters needed to recreate DeepBedMap model architecture properly
    parameters: dict = (
        pd.io.json.json_normalize(data=experiment.parameters)
        .set_index(keys="name")
        .valueCurrent.to_dict()
    )
    return (
        int(parameters["num_residual_blocks"]),
        float(parameters["residual_scaling"]),
    )
def _load_agent_for_policy_comparison(experiment, exp_key, env):
    """Build the agent described by a Comet experiment and load its latest checkpoint."""
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if agent_config_asset_list:
        # if we've saved the agent config dict, load it
        agent_args = json.loads(experiment.get_asset(agent_config_asset_list[0]['assetId']))
        # the saved config is only used when it carries an 'opt_type' entry
        if 'opt_type' not in agent_args['inference_optimizer_args']:
            agent_args = None
    agent = create_agent(env, agent_args=agent_args)[0]
    load_checkpoint(agent, exp_key)
    return agent


def compare_policies(exp_key1, exp_key2, write_result=True):
    """
    Compares the policies of two agents at the end of training.

    Each agent is rolled out in its own copy of the environment (the
    environment named by experiment 1). At every step both agents act on the
    state visited by agent 1 and then on the state visited by agent 2, and
    the summed KL divergence between their approximate posteriors is
    recorded in both directions.

    Args:
        exp_key1 (str): Comet experiment key of the first agent
        exp_key2 (str): Comet experiment key of the second agent
        write_result (bool): whether to pickle the result to
            'policy_kl_<exp_key1>_<exp_key2>.p'

    Returns:
        dict with 'kl12' (KL(agent1 || agent2) on agent 1's states) and
        'kl21' (KL(agent2 || agent1) on agent 2's states), each a list of
        N_STATES floats
    """
    # load the experiments
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    exp1 = comet_api.get_experiment(project_name=PROJECT_NAME,
                                    workspace=WORKSPACE,
                                    experiment=exp_key1)
    exp2 = comet_api.get_experiment(project_name=PROJECT_NAME,
                                    workspace=WORKSPACE,
                                    experiment=exp_key2)

    # create the environments (both use the env named by experiment 1)
    param_summary = exp1.get_parameters_summary()
    env_name = [a for a in param_summary if a['name'] == 'env'][0]['valueCurrent']
    env1 = create_env(env_name)
    env2 = create_env(env_name)

    # create the agents, each paired with the environment it is stepped in
    agent1 = _load_agent_for_policy_comparison(exp1, exp_key1, env1)
    agent2 = _load_agent_for_policy_comparison(exp2, exp_key2, env2)

    # evaluate the KL between policies
    kl12 = []
    kl21 = []
    agent1.reset(); agent1.eval()
    agent2.reset(); agent2.eval()
    state1 = env1.reset()
    state2 = env2.reset()
    for _ in range(N_STATES):
        # perform policy optimization on state1
        action1 = agent1.act(state1)
        agent2.act(state1)
        kl = kl_divergence(agent1.approx_post, agent2.approx_post).sum().detach().item()
        kl12.append(kl)
        agent1.reset(); agent1.eval()
        agent2.reset(); agent2.eval()
        # perform policy optimization on state2
        agent1.act(state2)
        action2 = agent2.act(state2)
        kl = kl_divergence(agent2.approx_post, agent1.approx_post).sum().detach().item()
        kl21.append(kl)
        # step the environments
        state1, _, done1, _ = env1.step(action1)
        state2, _, done2, _ = env2.step(action2)
        if done1:
            agent1.reset(); agent1.eval()
            state1 = env1.reset()
        if done2:
            agent2.reset(); agent2.eval()
            state2 = env2.reset()

    kls = {'kl12': kl12, 'kl21': kl21}
    if write_result:
        # context manager so the file is flushed and closed deterministically
        with open('policy_kl_' + exp_key1 + '_' + exp_key2 + '.p', 'wb') as f:
            pickle.dump(kls, f)
    return kls
def score_bayes_trainer_harness(self):
    """
    Run a Comet-ML Bayesian hyperparameter search for each configured model.

    For every model registered under ``self.type_time_series`` (and only when
    ``self.param_search == 'bayes'``), a Comet ``Optimizer`` is created in a
    fresh project, each suggested experiment trains the model for 2 epochs,
    and the mean absolute error on the first validation series is logged as
    the "loss" metric. Afterwards the experiment with the smallest metric is
    looked up through the Comet REST API and its parameters are stored in
    ``self.best_params[model_name]``.

    Returns
    -------
    :return: ``None``
        Results are stored on ``self.best_params``.
    """
    self.__log.info("Starting generic score train loop")
    train_data = self.train_data
    val_data = self.val_data
    models = self.model
    out_path = self.out_path  # NOTE(review): not used below
    type_time_series = self.type_time_series
    param_search = self.param_search
    trainer = self.trainer
    api_key = self.api_key
    rest_api_key = self.rest_api_key
    workspace = self.workspace
    # each entry maps a model name to a (model, kwargs) pair
    for models_ in models.get(type_time_series):
        for key, value in models_.items():
            model_name = key
            model = value[0]
            model_kwarg = value[1]
            if param_search == 'bayes':
                search_space = GluonTSBayesEstimatorKwargs.BayesModelLookup.get(
                    model_name)
                # comet-ml hyperparameter optimization configuration (bayes in this case)
                config = {
                    "algorithm": "bayes",
                    "spec": {
                        "maxCombo": 5,  # no of combinations to try
                        "objective": "minimize",
                        "metric": "loss",
                        "seed": 42,
                        "gridSize": 10,
                        "minSampleSize": 100,
                        "retryLimit": 20,
                        "retryAssignLimit": 0,
                    },
                    "name": "My Bayesian Search",
                    "trials": 1,
                }
                config['parameters'] = search_space
                # current time
                timestr = time.strftime("%Y%m%d-%H%M%S")
                # comet-ml project name for the optimization
                project_name = f"optimizer-{model_name}-{timestr}"
                # initialize the comet-ml optimizer
                optimizer = Optimizer(config=config,
                                      api_key=api_key,
                                      project_name=project_name)
                # loop through the parameter combinations that the bayes optimizer suggests
                for experiment in optimizer.get_experiments():
                    # explicitly set the model parameters (should be generic for any model)
                    if model_name == "SimpleFeedForward":
                        hidden1 = experiment.get_parameter(
                            "hidden_layer_size")
                        hidden2 = experiment.get_parameter(
                            "hidden2_layer_size")
                        model_kwarg['num_hidden_dimensions'] = [
                            hidden1, hidden2
                        ]
                        self.__log.info(
                            f"model_kwarg['num_hidden_dimensions'] : {model_kwarg['num_hidden_dimensions']}"
                        )
                    elif model_name == "DeepAREstimate":
                        model_kwarg[
                            'num_layers'] = experiment.get_parameter(
                                "num_layers")
                        model_kwarg[
                            'num_cells'] = experiment.get_parameter(
                                "num_cells")
                        model_kwarg[
                            'cell_type'] = experiment.get_parameter(
                                "cell_type")
                        model_kwarg[
                            'dropout_rate'] = experiment.get_parameter(
                                "dropout_rate")
                    # set trainer params
                    # (the shared trainer object is mutated in place)
                    trainer.learning_rate = experiment.get_parameter(
                        "learning_rate")
                    trainer.batch_size = experiment.get_parameter(
                        "batch_size")
                    trainer.epochs = 2
                    # initialize model from the suggested hyperparameters
                    # NOTE(review): `model` is rebound here, so from the second
                    # experiment on, from_hyperparameters is called on the
                    # previous result rather than on value[0] -- confirm intended
                    model = model.from_hyperparameters(**model_kwarg)
                    # set the trainer
                    model.trainer = trainer
                    self.__log.info(
                        f'\n model.trainer.lr : {model.trainer.learning_rate}'
                    )
                    self.__log.info(
                        f'model.trainer.epochs : {model.trainer.epochs}\n')
                    # train the model
                    predictor = model.train(train_data)
                    # make predictions
                    forecast_it, ts_it = make_evaluation_predictions(
                        dataset=val_data,  # test dataset
                        predictor=predictor,  # predictor
                        num_eval_samples=
                        1,  # number of sample paths we want for evaluation
                    )
                    # convert gluonts objects to lists
                    forecasts = list(forecast_it)
                    tss = list(ts_it)  # NOTE(review): not used below
                    # get prediction length
                    prediction_length = forecasts[0].mean.shape[0]
                    # only the first validation series / first forecast is scored
                    y_test_ = list(val_data)[0]['target']
                    y_preds_ = forecasts[0].mean
                    # compare the forecast against the tail of the target
                    y_test_ = y_test_[-prediction_length:]
                    mae_ = mean_absolute_error(y_test_, y_preds_)
                    # Report the loss to comet
                    experiment.log_metric("loss", mae_)
                    experiment.end()
                # initialize comet REST API to retrieve the best hyperparameters
                comet_api = comet_ml.API(rest_api_key=rest_api_key)
                project = comet_api.get(workspace=workspace,
                                        project_name=optimizer.
                                        experiment_kwargs['project_name'].lower())
                # get the experiment ids
                exp_ids = [x.id for x in project]
                scores_df = pd.DataFrame(index=exp_ids, columns=['metric'])
                # loop through the experiments within the comet project
                for exp_id in exp_ids:
                    exp = comet_api.get(
                        f"{workspace}/{project_name.lower()}/{exp_id}")
                    # takes the first metric returned; presumably the "loss"
                    # logged above -- TODO confirm
                    scores_df.at[exp_id,
                                 'metric'] = exp.get_metrics()[0]['metricValue']
                scores_df.metric = scores_df.metric.map(float)
                # get experiment_id of the best score
                best_exp_id = scores_df.metric.idxmin()
                # get the best experiment
                exp = comet_api.get(
                    f"{workspace}/{project_name.lower()}/{best_exp_id}")
                # get the best hyperparameters
                # (the parameter named 'f' is excluded)
                best_params = {
                    x['name']: x['valueCurrent']
                    for x in exp.get_parameters_summary() if x['name'] != 'f'
                }
                # save best params in model_name-keyed dictionary for later use
                self.best_params[model_name] = best_params
def compare_goal_optimizers(model_exp_key, opt_exp_key=None, write_results=True, stochastic_model=False):
    """
    Optimize random goal states using a model-based estimator, comparing
    several policy optimizers on the same pre-sampled sequence of goals.

    The gradient-based, iterative amortized and goal-conditioned optimizers
    are evaluated in that order; the latter two are trained online. The CEM
    evaluation is currently disabled, so its entry in the result stays None.

    Note: tailored to HalfCheetah-v2 environment currently.

    Args:
        model_exp_key (str): model-based experiment key
        opt_exp_key (str): optimizer experiment key. Currently unused; kept
            for interface compatibility
        write_results (bool): whether to pickle results directly to
            'comp_goal_opt_<model_exp_key>.p'
        stochastic_model (bool): whether to sample states or use mean estimate

    Returns:
        dict with keys 'grad_based', 'cem', 'it_am' and 'goal_cond' holding
        the output of collect_goal_optimization for each optimizer

    Raises:
        ValueError: if the experiment has no saved state asset
    """
    ## MODEL
    # load the model experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=model_exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)
    # create a synchronous env to parallelize training
    sync_env = SynchronousEnv(env, BATCH_SIZE)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # also, load the most recent episode to sample goal states; search the
    # state assets only, so another asset sharing the timestamp is never picked
    state_assets = [a for a in asset_list if 'state' in a['fileName']]
    if not state_assets:
        raise ValueError('No state assets found in experiment ' + model_exp_key + '.')
    state_asset = max(state_assets, key=lambda a: a['createdAt'])
    episode_states = json.loads(experiment.get_asset(state_asset['assetId']))

    # load the checkpoint
    load_checkpoint(agent, model_exp_key)

    if stochastic_model:
        agent.q_value_estimator.state_variable.cond_likelihood.stochastic = True

    # swap out the value estimator for goal-based estimator
    gb_estimator = GoalBasedQEstimator()
    # copy over the dynamics model
    gb_estimator.state_likelihood_model = agent.q_value_estimator.state_likelihood_model
    gb_estimator.state_variable = agent.q_value_estimator.state_variable
    # set the estimator
    agent.q_value_estimator = gb_estimator
    agent.q_value_estimator.set_goal_std(GOAL_STD)
    # agent.alphas['pi'] = 0.

    total_results = {'grad_based': None,
                     'cem': None,
                     'it_am': None,
                     'goal_cond': None}

    # sample one batch of goals per step up front, so every optimizer is
    # evaluated on the same goal sequence
    goals = []
    print('Sampling goals...')
    for step_ind in range(N_TOTAL_STEPS):
        # goals are drawn from the first 25 states of the loaded episode
        new_goal_states = np.stack([episode_states[np.random.randint(0, 25)]
                                    for _ in range(BATCH_SIZE)])
        new_goal_states = torch.from_numpy(new_goal_states).float().view(BATCH_SIZE, -1)
        # zero every goal dimension from index 8 onward
        new_goal_states[:, 8:] *= 0.
        if step_ind == 0:
            goal_state = new_goal_states
        else:
            # randomly change the goal state with some small probability
            flips = (torch.rand(BATCH_SIZE, 1) < GOAL_FLIP_PROB).float().repeat(1, new_goal_states.shape[-1])
            goal_state = (1 - flips) * goal_state + flips * new_goal_states
        goals.append(goal_state)

    print('Evaluating gradient-based agent...')
    agent.inference_optimizer = GradientBasedInference(lr=1e-3, n_inf_iters=50)
    grad_based_results = collect_goal_optimization(agent, sync_env, goals)
    total_results['grad_based'] = grad_based_results
    print('Done.')

    # CEM evaluation is disabled; total_results['cem'] stays None.
    # print('Evaluating CEM agent...')
    # agent.inference_optimizer = CEMInference(lr=1e-3, n_top_samples=10, n_inf_iters=50)
    # agent.n_action_samples = 100
    # cem_results = collect_goal_optimization(agent, sync_env, goals)
    # total_results['cem'] = cem_results
    # print('Done.')

    print('Evaluating iterative amortized agent...')
    # create an iterative amortized optimizer
    inputs = ['params', 'grads', 'state']
    n_input = 24
    if 'state' in inputs:
        n_input += 17
    network_args = {'type': 'recurrent',
                    'n_layers': 2,
                    'inputs': inputs,
                    'n_units': 512,
                    'connectivity': 'highway',
                    'n_input': n_input}
    agent.inference_optimizer = IterativeInferenceModel(network_args=network_args,
                                                        n_inf_iters=10)
    for m in agent.approx_post.models:
        agent.approx_post.models[m] = FullyConnectedLayer(512, 6)
        agent.approx_post.gates[m] = FullyConnectedLayer(512, 6, non_linearity='sigmoid')
    agent.approx_post.update = 'iterative'
    # create a parameter optimizer for the inference model
    inference_parameters = (list(agent.inference_optimizer.parameters())
                            + list(agent.approx_post.parameters()))
    inf_optim = optim.Adam(inference_parameters, lr=3e-4)
    it_am_results = collect_goal_optimization(agent, sync_env, goals, inf_optim=inf_optim)
    total_results['it_am'] = it_am_results
    print('Done.')

    print('Evaluating goal-conditioned agent...')
    # create a direct, goal-conditioned network
    network_args = {'type': 'fully_connected',
                    'n_layers': 2,
                    'inputs': ['state', 'goal'],
                    'n_units': 512,
                    'connectivity': 'highway',
                    'n_input': 17 + 17}
    agent.inference_optimizer = DirectGoalInferenceModel(network_args=network_args)
    for m in agent.approx_post.models:
        agent.approx_post.models[m] = FullyConnectedLayer(512, 6)
    agent.approx_post.update = 'direct'
    # create a parameter optimizer for the inference model
    inference_parameters = (list(agent.inference_optimizer.parameters())
                            + list(agent.approx_post.parameters()))
    inf_optim = optim.Adam(inference_parameters, lr=3e-4)
    goal_cond_results = collect_goal_optimization(agent, sync_env, goals, inf_optim=inf_optim)
    total_results['goal_cond'] = goal_cond_results
    print('Done.')

    if write_results:
        # context manager so the file is flushed and closed deterministically
        with open('comp_goal_opt_' + model_exp_key + '.p', 'wb') as f:
            pickle.dump(total_results, f)
    return total_results
def goal_optimization(model_exp_key, opt_exp_key=None, write_results=True):
    """
    Optimize random goal states using a model-based estimator.

    The agent of the model experiment is loaded, its value estimator is
    replaced by a goal-based estimator that reuses the learned dynamics
    model, and a single episode of N_TOTAL_STEPS steps is collected while a
    new goal is drawn every GOAL_INTERVAL steps.

    Note: tailored to HalfCheetah-v2 environment currently.

    Args:
        model_exp_key (str): model-based experiment key
        opt_exp_key (str): optimizer experiment key. If None, a
            gradient-based optimizer is used instead
        write_results (bool): whether to pickle results directly to
            'goal_opt_<model_exp_key>.p'

    Returns:
        dict with 'goal_states', 'traj_states', 'env_states' and 'actions',
        each holding one entry per step

    Raises:
        ValueError: if the model experiment has no saved state asset
    """
    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=model_exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # also, load the most recent episode to sample goal states; search the
    # state assets only, so another asset sharing the timestamp is never picked
    state_assets = [a for a in asset_list if 'state' in a['fileName']]
    if not state_assets:
        raise ValueError('No state assets found in experiment ' + model_exp_key + '.')
    state_asset = max(state_assets, key=lambda a: a['createdAt'])
    episode_states = json.loads(experiment.get_asset(state_asset['assetId']))

    # load the checkpoint
    load_checkpoint(agent, model_exp_key)

    # load the optimizer
    if opt_exp_key is not None:
        # load the experiment
        opt_experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                                  workspace=WORKSPACE,
                                                  experiment=opt_exp_key)
        # create the agent
        asset_list = opt_experiment.get_asset_list()
        agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
        agent_args = None
        if len(agent_config_asset_list) > 0:
            # if we've saved the agent config dict, load it
            agent_args = opt_experiment.get_asset(agent_config_asset_list[0]['assetId'])
            agent_args = json.loads(agent_args)
            agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
        opt_agent = create_agent(env, agent_args=agent_args)[0]
        # load the checkpoint
        load_checkpoint(opt_agent, opt_exp_key)
        # only the inference optimizer of the second agent is kept
        agent.inference_optimizer = opt_agent.inference_optimizer
        agent.inference_optimizer.n_inf_iters = 20
    else:
        # create a gradient-based optimizer
        agent.inference_optimizer = GradientBasedInference(lr=1e-3, n_inf_iters=50)

    # swap out the value estimator for goal-based estimator
    gb_estimator = GoalBasedQEstimator()
    # copy over the dynamics model
    gb_estimator.state_likelihood_model = agent.q_value_estimator.state_likelihood_model
    gb_estimator.state_variable = agent.q_value_estimator.state_variable
    # set the estimator
    agent.q_value_estimator = gb_estimator
    agent.q_value_estimator.set_goal_std(GOAL_STD)
    # agent.alphas['pi'] = 0.

    # optimize goal states
    goal_states = []
    traj_states = []
    env_states = {'qpos': [], 'qvel': []}
    actions = []
    # NOTE(review): collected every step but not part of the saved results
    inf_objectives = []

    agent.reset()
    agent.eval()
    state = env.reset()
    if RENDER:
        env.render()
    goal_state = None

    print('Collecting goal-optimization episode...')
    for step_ind in range(N_TOTAL_STEPS):
        print('STEP: ' + str(step_ind))
        if step_ind % GOAL_INTERVAL == 0:
            # goals are drawn from the first 25 states of the loaded episode
            goal_state = episode_states[np.random.randint(0, 25)]
            goal_state = torch.from_numpy(np.array(goal_state)).float().view(1, -1)
            # zero every goal dimension from index 8 onward
            goal_state[:, 8:] *= 0.
            if not TRAJECTORY_FOLLOW:
                agent.q_value_estimator.set_goal_state(goal_state)

        if TRAJECTORY_FOLLOW:
            # define a sub-goal between current state and goal state
            delta_state = goal_state - state
            traj_state = state + 0.1 * delta_state
            agent.q_value_estimator.set_goal_state(traj_state)
            traj_states.append(traj_state)
        else:
            # without trajectory following the target is the goal itself
            # (previously the whole goal_states list was appended by mistake)
            traj_states.append(goal_state)

        goal_states.append(goal_state)
        env_states['qpos'].append(copy.deepcopy(env.sim.data.qpos))
        env_states['qvel'].append(copy.deepcopy(env.sim.data.qvel))
        action = agent.act(state, eval=True)
        state, _, _, _ = env.step(action)
        inf_objectives.append(agent.inference_optimizer.estimated_objectives)
        agent.inference_optimizer.reset(1)
        if RENDER:
            env.render()
        actions.append(action)
    print('Done.')

    # save the results
    results = {'goal_states': goal_states,
               'traj_states': traj_states,
               'env_states': env_states,
               'actions': actions}

    if write_results:
        # context manager so the file is flushed and closed deterministically
        with open('goal_opt_' + model_exp_key + '.p', 'wb') as f:
            pickle.dump(results, f)
    return results
def goal_optimization_training(model_exp_key, opt_exp_key=None, write_results=True, stochastic_model=False, train_model=False):
    """
    Optimize random goal states using a model-based estimator. Train the
    policy optimizer online.

    A batch of BATCH_SIZE environments is stepped for N_TOTAL_STEPS steps.
    After every step the accumulated gradients of the inference optimizer
    (and approximate posterior) are averaged over inference iterations and
    batch size and applied with Adam.

    Note: tailored to HalfCheetah-v2 environment currently.

    Args:
        model_exp_key (str): model-based experiment key
        opt_exp_key (str): optimizer experiment key. If None, trains from scratch
        write_results (bool): whether to pickle results directly to
            'goal_opt_<model_exp_key>.p'
        stochastic_model (bool): whether to sample states or use mean estimate
        train_model (bool): whether to train the model online

    Returns:
        dict of per-step lists: 'goal_states', 'traj_states', 'env_states',
        'actions', 'inf_objectives', 'state_locs', 'state_scales',
        'state_log_likelihoods', 'state_squared_errors', 'model_cll_training'

    Raises:
        ValueError: if the model experiment has no saved state asset, or if
            ENCODING_TYPE is not 'grads' or 'errors' when training an
            optimizer from scratch
    """
    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=model_exp_key)

    # create the environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)
    # create a synchronous env to parallelize training
    sync_env = SynchronousEnv(env, BATCH_SIZE)

    # create the agent
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args)[0]

    # also, load the most recent episode to sample goal states; search the
    # state assets only, so another asset sharing the timestamp is never picked
    state_assets = [a for a in asset_list if 'state' in a['fileName']]
    if not state_assets:
        raise ValueError('No state assets found in experiment ' + model_exp_key + '.')
    state_asset = max(state_assets, key=lambda a: a['createdAt'])
    episode_states = json.loads(experiment.get_asset(state_asset['assetId']))

    # load the checkpoint
    load_checkpoint(agent, model_exp_key)

    if stochastic_model:
        agent.q_value_estimator.state_variable.cond_likelihood.stochastic = True

    # load the optimizer
    if opt_exp_key is not None:
        # load the experiment
        opt_experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                                  workspace=WORKSPACE,
                                                  experiment=opt_exp_key)
        # create the agent
        asset_list = opt_experiment.get_asset_list()
        agent_config_asset_list = [a for a in asset_list if 'agent_args' in a['fileName']]
        agent_args = None
        if len(agent_config_asset_list) > 0:
            # if we've saved the agent config dict, load it
            agent_args = opt_experiment.get_asset(agent_config_asset_list[0]['assetId'])
            agent_args = json.loads(agent_args)
            agent_args = agent_args if 'opt_type' in agent_args['inference_optimizer_args'] else None
        opt_agent = create_agent(env, agent_args=agent_args)[0]
        # load the checkpoint
        load_checkpoint(opt_agent, opt_exp_key)
        # only the inference optimizer of the second agent is kept
        agent.inference_optimizer = opt_agent.inference_optimizer
        agent.inference_optimizer.n_inf_iters = 10
    else:
        # create an iterative amortized optimizer
        n_input = 12
        if ENCODING_TYPE == 'grads':
            inputs = ['params', 'grads']
            n_input += 12
        elif ENCODING_TYPE == 'errors':
            inputs = ['params', 'errors']
            n_input += (17 + 17 + 6)
        else:
            raise ValueError('Unsupported ENCODING_TYPE: ' + str(ENCODING_TYPE))
        n_units = 512
        network_args = {'type': 'recurrent',
                        'n_layers': 2,
                        'inputs': inputs,
                        'n_units': n_units,
                        'connectivity': 'highway',
                        'batch_norm': False,
                        'dropout': None,
                        'separate_networks': False,
                        'n_input': n_input}
        agent.inference_optimizer = IterativeInferenceModel(network_args=network_args,
                                                            n_inf_iters=5,
                                                            encoding_type=ENCODING_TYPE)
        for m in agent.approx_post.models:
            agent.approx_post.models[m] = FullyConnectedLayer(n_units, 6)
            agent.approx_post.gates[m] = FullyConnectedLayer(n_units, 6, non_linearity='sigmoid')

    # create a parameter optimizer for the inference model
    inference_parameters = (list(agent.inference_optimizer.parameters())
                            + list(agent.approx_post.parameters()))
    param_opt = optim.Adam(inference_parameters, lr=3e-4)

    # swap out the value estimator for goal-based estimator
    gb_estimator = GoalBasedQEstimator()
    # copy over the dynamics model
    gb_estimator.state_likelihood_model = agent.q_value_estimator.state_likelihood_model
    gb_estimator.state_variable = agent.q_value_estimator.state_variable
    # set the estimator
    agent.q_value_estimator = gb_estimator
    agent.q_value_estimator.set_goal_std(GOAL_STD)
    # agent.alphas['pi'] = 0.

    model_param_opt = None
    if train_model:
        # create a parameter optimizer for the dynamics model
        model_parameters = (list(agent.q_value_estimator.state_likelihood_model.parameters())
                            + list(agent.q_value_estimator.state_variable.parameters()))
        model_param_opt = optim.Adam(model_parameters, lr=3e-4)

    # optimize goal states
    goal_states = []
    traj_states = []
    env_states = {'qpos': [], 'qvel': []}
    actions = []
    inf_objectives = []
    state_log_likelihoods = []
    state_squared_errors = []
    state_locs = []
    state_scales = []
    model_cll_training = []

    agent.reset(batch_size=BATCH_SIZE)
    agent.eval()
    state = sync_env.reset()
    if RENDER:
        env.render()
    goal_state = None
    state_likelihood = None

    print('Collecting goal-optimization episode...')
    for step_ind in range(N_TOTAL_STEPS):
        print('STEP: ' + str(step_ind))
        # candidate goals are resampled every step (GOAL_INTERVAL is not used
        # here); they are drawn from the first 25 states of the loaded episode
        new_goal_states = np.stack([episode_states[np.random.randint(0, 25)]
                                    for _ in range(BATCH_SIZE)])
        new_goal_states = torch.from_numpy(new_goal_states).float().view(BATCH_SIZE, -1)
        # zero every goal dimension from index 8 onward
        new_goal_states[:, 8:] *= 0.
        if step_ind == 0:
            goal_state = new_goal_states
        else:
            # randomly change the goal state with some small probability
            flips = (torch.rand(BATCH_SIZE, 1) < GOAL_FLIP_PROB).float().repeat(1, new_goal_states.shape[-1])
            goal_state = (1 - flips) * goal_state + flips * new_goal_states
        if not TRAJECTORY_FOLLOW:
            agent.q_value_estimator.set_goal_state(goal_state)

        if TRAJECTORY_FOLLOW:
            # define a sub-goal between current state and goal state
            delta_state = goal_state - state
            traj_state = state + 0.1 * delta_state
            agent.q_value_estimator.set_goal_state(traj_state)
            traj_states.append(traj_state)
        else:
            # without trajectory following the target is the goal itself
            # (previously the whole goal_states list was appended by mistake)
            traj_states.append(goal_state)

        goal_states.append(goal_state)
        qpos = np.stack([copy.deepcopy(e.sim.data.qpos) for e in sync_env.envs])
        qvel = np.stack([copy.deepcopy(e.sim.data.qvel) for e in sync_env.envs])
        env_states['qpos'].append(qpos)
        env_states['qvel'].append(qvel)
        action = agent.act(state, eval=True)
        state, _, _, _ = sync_env.step(action)
        inf_objectives.append(agent.inference_optimizer.estimated_objectives)

        if train_model:
            agent.q_value_estimator.generate(agent)
            cll = -agent.q_value_estimator.state_variable.cond_log_likelihood(state).view(-1, 1).mean()
            model_cll_training.append(cll.detach().item())
            # clear stale model gradients so each update uses only this
            # step's likelihood gradient (they were never zeroed before)
            model_param_opt.zero_grad()
            cll.backward()
            model_param_opt.step()

        if state_likelihood is not None:
            # score the previous step's state prediction against the new state
            state_ll = state_likelihood.log_prob(state)
            state_log_likelihoods.append(state_ll)
            state_squared_error = (state_likelihood.loc - state).pow(2)
            state_squared_errors.append(state_squared_error)

        state_loc = agent.collector.distributions['state']['cond_like']['loc'][-1]
        state_scale = agent.collector.distributions['state']['cond_like']['scale'][-1]
        state_locs.append(state_loc)
        state_scales.append(state_scale)
        state_likelihood = Normal(state_loc, state_scale)

        # update the inference optimizer
        grads = [param.grad for param in inference_parameters]
        divide_gradients_by_value(grads, agent.inference_optimizer.n_inf_iters)
        divide_gradients_by_value(grads, BATCH_SIZE)
        param_opt.step()
        param_opt.zero_grad()

        agent.inference_optimizer.reset(BATCH_SIZE)
        if RENDER:
            env.render()
        actions.append(action)
    print('Done.')

    # save the results
    results = {'goal_states': goal_states,
               'traj_states': traj_states,
               'env_states': env_states,
               'actions': actions,
               'inf_objectives': inf_objectives,
               'state_locs': state_locs,
               'state_scales': state_scales,
               'state_log_likelihoods': state_log_likelihoods,
               'state_squared_errors': state_squared_errors,
               'model_cll_training': model_cll_training}

    if write_results:
        # context manager so the file is flushed and closed deterministically
        with open('goal_opt_' + model_exp_key + '.p', 'wb') as f:
            pickle.dump(results, f)
    return results
def evaluate_estimator(exp_key, n_state_action, n_mc_samples, device_id=None):
    """
    Evaluates the value estimator of a cached experiment throughout learning.

    State-action pairs are first collected with uniformly sampled actions.
    Then, for every sub-sampled checkpoint of the experiment, the agent's
    value estimates for those pairs are recorded alongside Monte Carlo
    return estimates obtained by rolling out the environment.

    Args:
        exp_key (str): the string of the comet experiment key
        n_state_action (int): number of state action pairs to evaluate
        n_mc_samples (int): number of Monte Carlo samples to estimate
                            environment returns; rounded up to a whole
                            number of batches of size ROLLOUT_BATCH_SIZE
        device_id (optional): forwarded unchanged to create_agent

    Returns dictionary containing:
        ckpt_timesteps [n_ckpts]
        value_estimates [n_ckpts, n_state_action, 1],
        direct_value_estimates [n_ckpts, n_state_action, 1]
        mc_estimates [n_ckpts, n_state_action, n_mc_samples]
    where n_ckpts is the number of checkpoints remaining after sub-sampling
    by CKPT_SUBSAMPLE and n_mc_samples is the rounded-up sample count.
    """
    # load the experiment
    comet_api = comet_ml.API(api_key=LOADING_API_KEY)
    experiment = comet_api.get_experiment(project_name=PROJECT_NAME,
                                          workspace=WORKSPACE,
                                          experiment=exp_key)

    # create the corresponding environment
    param_summary = experiment.get_parameters_summary()
    env_name = [a for a in param_summary
                if a['name'] == 'env'][0]['valueCurrent']
    env = create_env(env_name)

    # collect state-action samples using random policy
    print('Collecting ' + str(n_state_action) + ' state-action pairs...')
    sa_pairs = {'states': [], 'env_states': [], 'actions': []}
    state = env.reset()
    # snapshot the simulator state *before* stepping, so that each stored
    # (env_state, state, action) triple describes the same point in time
    env_state = (copy.deepcopy(env.sim.data.qpos),
                 copy.deepcopy(env.sim.data.qvel))
    for _ in range(n_state_action):
        action = env.action_space.sample()
        next_state, _, done, _ = env.step(action)
        sa_pairs['states'].append(state)
        sa_pairs['env_states'].append(env_state)
        sa_pairs['actions'].append(torch.from_numpy(action).view(1, -1))
        state = env.reset() if done else next_state
        env_state = (copy.deepcopy(env.sim.data.qpos),
                     copy.deepcopy(env.sim.data.qvel))
    print('Done.')

    # enumerate state-action pairs, estimating returns at each stage of learning
    asset_list = experiment.get_asset_list()
    agent_config_asset_list = [
        a for a in asset_list if 'agent_args' in a['fileName']
    ]
    agent_args = None
    if len(agent_config_asset_list) > 0:
        # if we've saved the agent config dict, load it
        agent_args = experiment.get_asset(
            agent_config_asset_list[0]['assetId'])
        agent_args = json.loads(agent_args)
        # only use the saved config if it contains 'opt_type'; otherwise
        # fall back to create_agent's defaults
        agent_args = agent_args if 'opt_type' in agent_args[
            'inference_optimizer_args'] else None
    agent = create_agent(env, agent_args=agent_args, device_id=device_id)[0]

    # get the list of checkpoint timesteps
    ckpt_asset_list = [a for a in asset_list if 'ckpt' in a['fileName']]
    ckpt_asset_names = [a['fileName'] for a in ckpt_asset_list]
    ckpt_timesteps = [
        int(s.split('ckpt_step_')[1].split('.ckpt')[0])
        for s in ckpt_asset_names
    ]
    # sub-sample the (sorted) checkpoints *before* allocating the result
    # arrays, so their first dimension matches the returned ckpt_timesteps
    # rather than carrying unused all-zero rows
    ckpt_timesteps = list(np.sort(ckpt_timesteps)[::CKPT_SUBSAMPLE])
    n_ckpts = len(ckpt_timesteps)

    # convert n_mc_samples to a round number of batches
    n_batches = math.ceil(n_mc_samples / ROLLOUT_BATCH_SIZE)
    n_mc_samples = ROLLOUT_BATCH_SIZE * n_batches

    value_estimates = np.zeros((n_ckpts, n_state_action, 1))
    direct_value_estimates = np.zeros((n_ckpts, n_state_action, 1))
    mc_estimates = np.zeros((n_ckpts, n_state_action, n_mc_samples))

    # iterate over sub-sampled checkpoint timesteps, evaluating
    for ckpt_ind, ckpt_timestep in enumerate(ckpt_timesteps):
        # load the checkpoint
        print('Evaluating checkpoint ' + str(ckpt_ind + 1) + ' of ' +
              str(n_ckpts))
        load_checkpoint(agent, exp_key, ckpt_timestep)
        # get value estimate and estimate returns for the state-action pairs
        for sa_ind, (env_state, state, act) in enumerate(
                zip(sa_pairs['env_states'], sa_pairs['states'],
                    sa_pairs['actions'])):
            t_start = time.time()
            action_value_estimate = get_agent_value_estimate(
                agent, state, act)
            value_estimates[ckpt_ind, sa_ind, :] = \
                action_value_estimate['estimate']
            direct_value_estimates[ckpt_ind, sa_ind, :] = \
                action_value_estimate['direct']
            # presumably returns n_mc_samples values (n_batches batches of
            # ROLLOUT_BATCH_SIZE) -- confirm against
            # estimate_monte_carlo_return
            returns = estimate_monte_carlo_return(env, agent, env_state,
                                                  state, act, n_batches)
            mc_estimates[ckpt_ind, sa_ind, :] = returns
            # progress is reported after every state-action pair
            print(' Evaluated ' + str(sa_ind + 1) + ' of ' +
                  str(len(sa_pairs['states'])) + ' state-action pairs.')
            print(' Duration: ' + '{:.2f}'.format(time.time() - t_start) +
                  ' s / state-action pair.')

    # TODO: log the value estimates to comet (need to json-ify the numpy arrays)
    # prev_exp = comet_ml.ExistingExperiment(api_key=LOGGING_API_KEY,
    #                                        previous_experiment=exp_key)
    # prev_exp.log_asset_data(value_estimates, name='value_estimates')
    # prev_exp.log_asset_data(direct_value_estimates, name='direct_value_estimates')
    # prev_exp.log_asset_data(mc_estimates, name='mc_estimates')

    return {
        'ckpt_timesteps': ckpt_timesteps,
        'value_estimates': value_estimates,
        'direct_value_estimates': direct_value_estimates,
        'mc_estimates': mc_estimates
    }