def __init__(self, spec_tree, device):
    self.spec_tree = spec_tree
    self.device = device

    # Get spec values
    self.enabled = spec_tree['enabled']
    behavior_policy_range = spec_tree['behavior_policy_range']
    self.behavior_policy_range = range(behavior_policy_range['min'],
                                       behavior_policy_range['max'] + 1,
                                       behavior_policy_range['step'])
    self.temperature = spec_tree['temperature']
    self.num_episodes = spec_tree['num_episodes']
    self.save_data_to = resolve_path(spec_tree['save_data_to'])
    self.dataset_seed = spec_tree['dataset_seed']
    # self.dataset_seed = range(dataset_seed['start'], dataset_seed['stop'], dataset_seed['step'])

    # Load the collection of behavior policies.
    self.behavior_policy_collection = []
    dummy_env = self.spec_tree.create_component('environment')
    self.obs_space = dummy_env.observation_space
    self.action_space = dummy_env.action_space
    for behavior_policy_id in self.behavior_policy_range:
        behavior_policy = self.spec_tree.create_component('model', self.obs_space, self.action_space)
        model_path = self.spec_tree['load_model_from'] + 'policy_' + str(behavior_policy_id) + '.pt'
        behavior_policy.load_model(model_path)
        self.behavior_policy_collection.append(behavior_policy)

    data_path = spec_tree['save_data_to'] + '/behavior_' + str(min(self.behavior_policy_range)) + \
        '_' + str(max(self.behavior_policy_range)) + '/n_eps' + str(self.num_episodes) + \
        '/horizon' + str(dummy_env.max_ep_len) + '/'
    ensure_dir_exists(file=data_path)
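# ensure_dir_exists and resolve_path are repo utilities used throughout these stages.
# A minimal sketch of the behavior assumed here (the _sketch suffix marks these as
# illustrations, not the repo's actual implementations):
import os

def ensure_dir_exists_sketch(file=None, dir=None):
    # Create the parent directory of a file path (or the directory itself) if missing.
    target = os.path.dirname(file) if file is not None else dir
    if target:
        os.makedirs(target, exist_ok=True)

def resolve_path_sketch(path):
    # Expand '~' and environment variables; pass through empty/None specs unchanged.
    return os.path.expandvars(os.path.expanduser(path)) if path else path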
def add_output_file(self, file_path):
    # Add one console output file.
    file_path = os.path.abspath(file_path)
    self.file_paths.append(file_path)
    utils.ensure_dir_exists(file=file_path)
    output_file = open(file_path, 'w')
    output_file.close()
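# logger.write_line (used later in this section) presumably tees each line to stdout and
# to every file registered via add_output_file. A minimal sketch under that assumption,
# written as a free function over a logger-like object with a file_paths list:
def write_line_sketch(log, line):
    print(line)
    for path in log.file_paths:
        # add_output_file already created/truncated each file, so append here.
        with open(path, 'a') as output_file:
            output_file.write(line + '\n')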
def __init__(self, spec_tree, run_spec_path, hp_handler):
    self.spec_tree = spec_tree
    self.hp_handler = hp_handler

    # Get spec values.
    self.cuda = spec_tree['cuda']
    self.log_to_tensorboard = spec_tree['log_to_tensorboard']
    self.experiment_path = spec_tree['experiment_path']

    # Begin writing two copies of the console output.
    logger.add_output_file('console.txt')
    logger.add_output_file(os.path.join(self.experiment_path, 'console.txt'))

    # Check the experiment_path.
    if not self.experiment_path.startswith('../results'):
        logger.write_line("WARNING: experiment_path '{}' (found in runspec) does not begin with '../results'. "
                          "Job results will not be mirrored to Azure Storage.".format(self.experiment_path))

    # Copy the launched runspec to the results folder.
    dest = pjoin(self.experiment_path, os.path.basename(run_spec_path))
    if run_spec_path != dest:
        copyfile(run_spec_path, dest)

    # Is this app running as part of a launched job?
    in_job = os.getenv("XT_RUN_NAME")
    if in_job:
        # Yes. Don't create another job launcher.
        self.job_launcher = None
    else:
        # No. Try to instantiate a job launcher.
        self.job_launcher = spec_tree.create_component('job_launcher')
        if self.job_launcher and self.job_launcher.hp_tuning:
            self.hp_handler.write_hp_config_file()

    # Write the top portion of the repro spec tree to two files,
    # one in the rl_nexus dir, and the other in the experiment_path dir.
    local_repro_spec_path = 'repro_spec.yaml'
    exper_repro_spec_path = os.path.join(self.experiment_path, 'repro_spec.yaml')
    utils.ensure_dir_exists(file=exper_repro_spec_path)
    self.repro_spec_paths = (local_repro_spec_path, exper_repro_spec_path)
    self.write_to_repro_spec(self.spec_tree, '', 'w')
    self.write_to_repro_spec('\nprocessing_stages:\n', '', 'a')
def __init__(self, spec_tree, device):
    self.spec_tree = spec_tree
    self.device = device

    # Get spec values
    self.enabled = spec_tree['enabled']
    self.save_model_to = resolve_path(spec_tree['save_model_to'])
    self.save_logs_to = resolve_path(spec_tree['save_logs_to'])
    self.max_iterations = spec_tree['max_iterations']
    self.iters_per_report = spec_tree['iters_per_report']
    self.get_action_from_env = spec_tree['get_action_from_env']
    self.train = spec_tree['train']
    self.render = spec_tree['render']
    self.model_load_paths_dict = {
        'load_model_from': resolve_path(spec_tree['load_model_from']),
        'load_backbone_from': resolve_path(spec_tree['load_backbone_from']),
        'load_core_from': resolve_path(spec_tree['load_core_from']),
        'load_embedding_from': resolve_path(spec_tree['load_embedding_from']),
        'load_head_from': resolve_path(spec_tree['load_head_from'])
    }
    self.model_save_paths_dict = {
        'save_model_to': resolve_path(spec_tree['save_model_to']),
        'save_backbone_to': resolve_path(spec_tree['save_backbone_to']),
        'save_core_to': None,
        'save_embedding_to': None,
        'save_head_to': resolve_path(spec_tree['save_head_to'])
    }

    # Environment component
    self.environment = spec_tree.create_component('environment')

    # Agent component
    self.agent = spec_tree.create_component('agent',
                                            self.environment.observation_space,
                                            self.environment.action_space,
                                            device)

    # XT related
    self.xt_run_name = os.getenv("XT_RUN_NAME", None)
    self.xt_run = None
    if self.xt_run_name:
        from xtlib.run import Run as XTRun
        self.xt_run = XTRun()
        # Log hyperparameter values to XT. (in progress)
        # hd = cf.get_hparam_dict()
        # self.xt_run.log_hparams(hd)

    # Note: 'is not' on a string literal relies on interning and is a bug; use '!='.
    if self.agent.loop_type() != 'ray_loop':
        evaluation_num_episodes = spec_tree['evaluation_num_episodes']
        assert evaluation_num_episodes == 0, \
            "Only rllib's algorithm implementations support intra-stage evaluation."

    self.agent.load_model(self.model_load_paths_dict, True)

    if self.save_model_to:
        ensure_dir_exists(file=self.save_model_to)
        logger.write_line("Saving models to {}".format(self.save_model_to))

    # Switch the agent into eval mode if requested
    if not self.train and not spec_tree['disable_eval']:
        if self.agent.model is not None:
            self.agent.model.eval()

    self.metric_data_list = []
def __init__(self, spec_tree, device):
    self.spec_tree = spec_tree
    self.seed = spec_tree['algo_seed']
    self.dataset_seed = spec_tree['dataset_seed']
    self.use_ray = spec_tree['use_ray']
    self.ope = spec_tree['offline_estimators']
    assert isinstance(self.ope, list), 'offline_estimators should be specified as a list'
    # self.metrics = {}
    # for estimator in self.ope:
    #     metric = Metric(short_name=estimator, long_name=estimator,
    #                     formatting_string='{:5.2f}', higher_is_better=False)
    #     self.metrics[estimator] = metric
    self.horizon = spec_tree['horizon']
    self.num_episodes = spec_tree['num_episodes']
    self.gamma = spec_tree['gamma']
    self.normalization = spec_tree['normalization']
    self.target_temp = spec_tree['target_policy_temperature']

    # Reset the environment seed.
    spec_tree['environment']['seed'] = self.seed
    assert self.horizon == spec_tree['environment']['max_ep_len'], \
        'horizon and max_ep_len of environment do not match'
    assert not spec_tree['environment']['fixed_length_episode'], \
        'fixed_length_episode must be disabled for on-policy evaluation'
    self.environment = spec_tree.create_component('environment')
    obs_space = self.environment.observation_space
    action_space = self.environment.action_space
    self.obs_dim = obs_space.shape[0]
    self.act_dim = action_space.n

    #* Prepare the result path.
    self.behavior_policy_type = spec_tree['behavior_policy_type']
    if self.behavior_policy_type == 'random':
        behavior_type_string = '/behavior_uniform_random'
    elif self.behavior_policy_type == 'random_network':
        behavior_type_string = '/behavior_random_network'
    elif self.behavior_policy_type == 'epsilon_greedy':
        self.behavior_min_id = spec_tree['behavior_policy_range']['min']
        self.behavior_max_id = spec_tree['behavior_policy_range']['max']
        # Leading '/' added for consistency with the other branches.
        behavior_type_string = '/behavior_epsilon_greedy_' + str(self.behavior_min_id) + \
            '_' + str(self.behavior_max_id)
    else:
        self.behavior_min_id = spec_tree['behavior_policy_range']['min']
        self.behavior_max_id = spec_tree['behavior_policy_range']['max']
        behavior_type_string = '/behavior_' + str(self.behavior_min_id) + \
            '_' + str(self.behavior_max_id)
    self.debug_mode = spec_tree['debug_mode']
    self.result_path_prefix = spec_tree['save_results_to'] + behavior_type_string + \
        '/target_{}_temp{}/n_eps{}/horizon{}/seed'.format(
            spec_tree['target_policy_id'], self.target_temp, self.num_episodes, self.horizon)
    ensure_dir_exists(file=self.result_path_prefix)

    #* Prepare the data path (if reading from an external data set).
    self.read_data_from_file = spec_tree['read_data_from_file']
    self.dataset_path_prefix = None
    if self.read_data_from_file:
        self.dataset_path_prefix = spec_tree['load_data_from'] + behavior_type_string + \
            '/n_eps{}/horizon{}/seed'.format(self.num_episodes, self.horizon)

    # Load the target policy network.
    target_policy_net_path = spec_tree['load_model_from'] + '/policy_' + \
        str(spec_tree['target_policy_id']) + '.pt'
    # self.tf_policy_net = convert_policy_network(spec_tree, obs_space, action_space,
    #                                             temperature=self.target_temp, path=target_policy_net_path)
    self.target_policy_net = spec_tree.create_component('model', obs_space, action_space)
    self.target_policy_net.temperature = self.target_temp
    self.target_policy_net.load_model(target_policy_net_path)
    # self.target_model_weights_list = extract_model_weights(self.target_policy_net.model, self.obs_dim, self.act_dim)
    self.target_model_weights_list = convert_torch_model_weights_to_list(self.target_policy_net.model)

    # Estimate the target policy's true value by on-policy Monte Carlo rollouts.
    on_policy_num_eps = spec_tree['on_policy_eval_num_episodes']
    self.value_true = evaluate_on_policy(self.environment, self.target_policy_net,
                                         num_episodes=on_policy_num_eps, gamma=self.gamma)

    self.hidden_layers_net = self.spec_tree['model']['fcnet_hiddens']
    self.activation_net = self.spec_tree['model']['fcnet_activation']
    self.data = None
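# evaluate_on_policy supplies the ground-truth value that the offline estimators are later
# compared against. A minimal sketch of that Monte Carlo computation, assuming a gym-style
# environment (reset/step returning obs, reward, done, info) and a policy object exposing a
# sample_action(obs) method (hypothetical name), not the repo's actual implementation:
def evaluate_on_policy_sketch(environment, policy, num_episodes, gamma):
    returns = []
    for _ in range(num_episodes):
        obs = environment.reset()
        done, discount, episode_return = False, 1.0, 0.0
        while not done:
            action = policy.sample_action(obs)
            obs, reward, done, _ = environment.step(action)
            episode_return += discount * reward
            discount *= gamma
        returns.append(episode_return)
    # The average discounted return across episodes approximates the on-policy value.
    return sum(returns) / len(returns)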
def __init__(self, spec_tree, device):
    self.spec_tree = spec_tree
    self.device = device

    # Get spec values
    self.enabled = spec_tree['enabled']
    self.behavior_policy_type = spec_tree['behavior_policy_type']
    if self.behavior_policy_type == 'random':
        self.behavior_policy_range = None
    elif self.behavior_policy_type == 'random_network':
        self.behavior_policy_range = None
    elif self.behavior_policy_type in ('range', 'range_tabular', 'epsilon_greedy'):
        behavior_policy_range = spec_tree['behavior_policy_range']
        self.behavior_policy_range = range(behavior_policy_range['min'],
                                           behavior_policy_range['max'] + 1,
                                           behavior_policy_range['step'])
    elif self.behavior_policy_type == 'single':
        behavior_policy_range = spec_tree['behavior_policy_range']
        assert behavior_policy_range['min'] == behavior_policy_range['max']
        assert behavior_policy_range['step'] == 1
        self.behavior_policy_range = [behavior_policy_range['min']]
    self.temperature = spec_tree['temperature']
    self.num_episodes = spec_tree['num_episodes']
    self.save_data_to = resolve_path(spec_tree['save_data_to'])
    self.dataset_seed = spec_tree['dataset_seed']
    # self.dataset_seed = range(dataset_seed['start'], dataset_seed['stop'], dataset_seed['step'])

    # Load the collection of behavior policies.
    self.behavior_policy_collection = []
    dummy_env = self.spec_tree.create_component('environment')
    self.obs_space = dummy_env.observation_space
    self.action_space = dummy_env.action_space
    if self.behavior_policy_type == 'range_tabular':
        for behavior_policy_id in self.behavior_policy_range:
            policy_path = self.spec_tree['load_model_from'] + 'pi' + str(behavior_policy_id) + '.npy'
            q_table_path = self.spec_tree['load_model_from'] + 'q' + str(behavior_policy_id) + '.npy'
            behavior_policy = np.load(policy_path)
            q_table = np.load(q_table_path)
            self.behavior_policy_collection.append(behavior_policy)
    else:
        if self.behavior_policy_range:
            # '!=' replaces the original 'is not' string comparisons, which are a bug.
            assert self.behavior_policy_type != 'random'
            assert self.behavior_policy_type != 'random_network'
            for behavior_policy_id in self.behavior_policy_range:
                behavior_policy = self.spec_tree.create_component('model', self.obs_space, self.action_space)
                model_path = self.spec_tree['load_model_from'] + 'policy_' + str(behavior_policy_id) + '.pt'
                behavior_policy.load_model(model_path)
                self.behavior_policy_collection.append(behavior_policy)
        elif self.behavior_policy_type == 'random_network':
            behavior_policy = self.spec_tree.create_component('model', self.obs_space, self.action_space)
            self.behavior_policy_collection.append(behavior_policy)

    self.data_path = None
    self.save_data = spec_tree['save_data']
    if self.save_data:
        if self.behavior_policy_type == 'random':
            behavior_type_string = '/behavior_uniform_random'
        elif self.behavior_policy_type == 'random_network':
            behavior_type_string = '/behavior_random_network'
        elif self.behavior_policy_type == 'epsilon_greedy':
            behavior_type_string = '/behavior_epsilon_greedy_' + str(min(self.behavior_policy_range)) + \
                '_' + str(max(self.behavior_policy_range))
        else:
            behavior_type_string = '/behavior_' + str(min(self.behavior_policy_range)) + \
                '_' + str(max(self.behavior_policy_range))
        self.data_path = spec_tree['save_data_to'] + behavior_type_string + \
            '/n_eps' + str(self.num_episodes) + '/horizon' + str(dummy_env.max_ep_len) + '/'
        ensure_dir_exists(file=self.data_path)
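# Example of the resulting dataset layout (hypothetical spec values): with
# behavior_policy_type='range', behavior_policy_range={'min': 1, 'max': 5, 'step': 1},
# num_episodes=200, and an environment whose max_ep_len is 100, the code above produces
#   self.data_path == '<save_data_to>/behavior_1_5/n_eps200/horizon100/'
# which matches the dataset_path_prefix construction used by the OPE stage constructors
# in this section.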
def save_model_in_progress(self, save_paths_dict, policy_tag):
    save_path = save_paths_dict['save_model_to'] + '_' + str(policy_tag) + ".pt"
    ensure_dir_exists(file=save_path)
    torch.save(self.policy_network.state_dict(), save_path)
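# Usage sketch (hypothetical paths): checkpoints written by save_model_in_progress can be
# restored into a network of the same architecture with the standard PyTorch pattern.
#
#   stage.save_model_in_progress({'save_model_to': '../results/run1/policy'}, policy_tag=3)
#   # ...later, assuming `policy_network` has the same architecture:
#   state_dict = torch.load('../results/run1/policy_3.pt')
#   policy_network.load_state_dict(state_dict)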
def __init__(self, spec_tree, device):
    self.spec_tree = spec_tree
    # self.seed = eval('random.' + spec_tree['seed'])
    # self.seed = random.randint(spec_tree['algo_seed'])
    self.seed = spec_tree['algo_seed']
    self.dataset_seed = spec_tree['dataset_seed']
    self.use_ray = spec_tree['use_ray']
    self.ope = spec_tree['offline_estimators']
    assert isinstance(self.ope, list), 'offline_estimators should be specified as a list'
    # self.metrics = {}
    # for estimator in self.ope:
    #     metric = Metric(short_name=estimator, long_name=estimator,
    #                     formatting_string='{:5.2f}', higher_is_better=False)
    #     self.metrics[estimator] = metric
    self.horizon = spec_tree['horizon']
    self.num_episodes = spec_tree['num_episodes']
    self.gamma = spec_tree['gamma']
    self.normalization = spec_tree['normalization']
    self.target_temp = spec_tree['target_policy_temperature']

    # Reset the environment seed.
    spec_tree['environment']['seed'] = self.seed
    assert self.horizon == spec_tree['environment']['max_ep_len'], \
        'horizon and max_ep_len of environment do not match'
    assert not spec_tree['environment']['fixed_length_episode'], \
        'fixed_length_episode must be disabled for on-policy evaluation'
    self.environment = spec_tree.create_component('environment')
    obs_space = self.environment.observation_space
    action_space = self.environment.action_space
    self.obs_dim = obs_space.shape[0]
    self.act_dim = action_space.n

    # Locate the target policy checkpoint.
    target_policy_net_path = spec_tree['load_model_from'] + '/policy_' + \
        str(spec_tree['target_policy_id']) + '.pt'

    #* Prepare the data and result paths.
    self.behavior_min_id = spec_tree['behavior_policy_range']['min']
    self.behavior_max_id = spec_tree['behavior_policy_range']['max']
    self.dataset_path_prefix = spec_tree['load_data_from'] + \
        '/behavior_{}_{}/n_eps{}/horizon{}/seed'.format(
            self.behavior_min_id, self.behavior_max_id, self.num_episodes, self.horizon)
    self.result_path_prefix = spec_tree['save_results_to'] + \
        '/behavior_{}_{}/target_{}_temp{}/n_eps{}/horizon{}/seed'.format(
            self.behavior_min_id, self.behavior_max_id, spec_tree['target_policy_id'],
            self.target_temp, self.num_episodes, self.horizon)
    ensure_dir_exists(file=self.result_path_prefix)

    # Load the target policy network.
    # sess = make_session()
    # sess.__enter__()
    # self.tf_policy_net = convert_policy_network(spec_tree, obs_space, action_space,
    #                                             temperature=self.target_temp, path=target_policy_net_path)
    self.target_policy_net = spec_tree.create_component('model', obs_space, action_space)
    self.target_policy_net.temperature = self.target_temp
    self.target_policy_net.load_model(target_policy_net_path)
    # self.target_model_weights_list = extract_model_weights(self.target_policy_net.model, self.obs_dim, self.act_dim)
    self.target_model_weights_list = convert_torch_model_weights_to_list(self.target_policy_net.model)

    # Estimate the target policy's true value by on-policy Monte Carlo rollouts.
    on_policy_num_eps = spec_tree['on_policy_eval_num_episodes']
    self.value_true = evaluate_on_policy(self.environment, self.target_policy_net,
                                         num_episodes=on_policy_num_eps, gamma=self.gamma)

    # Commented-out remnant of an earlier version that loaded and whitened the offline
    # dataset in place; kept for reference to document the std_norm normalization and
    # episode slicing that were applied to the data.
    # data = read_batch_experience(dataset_path, self.target_policy_net, self.num_episodes,
    #                              self.target_temp, self.horizon, self.gamma)
    # if self.normalization == 'std_norm':
    #     #* whiten data
    #     obs_mean = np.mean(data['obs'], axis=0, keepdims=True)
    #     obs_std = np.std(data['obs'], axis=0, keepdims=True)
    #     data['obs'] = (data['obs'] - obs_mean) / obs_std
    #     data['next_obs'] = (data['next_obs'] - obs_mean) / obs_std
    #     data['init_obs'] = (data['init_obs'] - obs_mean) / obs_std
    #     data['term_obs'] = (data['term_obs'] - obs_mean) / obs_std
    #     self.norm_performed = {'type': self.normalization, 'shift': obs_mean, 'scale': obs_std}
    # else:
    #     self.norm_performed = {'type': None, 'shift': None, 'scale': None}
    # data['target_prob_init_obs'] = data['target_prob_obs'][::self.horizon]
    # data['target_prob_term_obs'] = data['target_prob_next_obs'][self.horizon-1::self.horizon]
    # data['next_acts'] = data['acts'].copy()
    # for i in range(self.num_episodes):
    #     data['next_acts'][i*self.horizon:(i+1)*self.horizon-1] = \
    #         data['acts'][i*self.horizon+1:(i+1)*self.horizon].copy()
    # self.data = data

    self.hidden_layers_net = self.spec_tree['model']['fcnet_hiddens']
    self.activation_net = self.spec_tree['model']['fcnet_activation']
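# convert_torch_model_weights_to_list presumably flattens the torch policy network into a
# plain list of per-layer numpy [weight, bias] pairs so that non-torch estimator code can
# rebuild it. A minimal sketch under that assumption (hypothetical; covers only nn.Linear
# layers, and the weight-matrix orientation is an assumption, not confirmed by the source):
import torch.nn as nn

def convert_torch_model_weights_to_list_sketch(model):
    weights_list = []
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Detach to numpy so the result carries no autograd or device state.
            weights_list.append([module.weight.detach().cpu().numpy(),
                                 module.bias.detach().cpu().numpy()])
    return weights_list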