Example #1
    def __init__(self, spec_tree, device):
        self.spec_tree = spec_tree
        self.device = device

        # get spec values
        self.enabled = spec_tree['enabled']
        behavior_policy_range = spec_tree['behavior_policy_range']
        self.behavior_policy_range = range(behavior_policy_range['min'],
                                           behavior_policy_range['max'] + 1,
                                           behavior_policy_range['step'])
        self.temperature = spec_tree['temperature']
        self.num_episodes = spec_tree['num_episodes']
        self.save_data_to = resolve_path(spec_tree['save_data_to'])
        self.dataset_seed = spec_tree['dataset_seed']
        # self.dataset_seed = range(dataset_seed['start'], dataset_seed['stop'], dataset_seed['step'])

        self.behavior_policy_collection = []
        dummy_env = self.spec_tree.create_component('environment')
        self.obs_space = dummy_env.observation_space
        self.action_space = dummy_env.action_space

        for behavior_policy_id in self.behavior_policy_range:
            behavior_policy = self.spec_tree.create_component(
                'model', self.obs_space, self.action_space)
            model_path = self.spec_tree['load_model_from'] + 'policy_' + str(
                behavior_policy_id) + '.pt'
            behavior_policy.load_model(model_path)
            self.behavior_policy_collection.append(behavior_policy)

        data_path = spec_tree['save_data_to']+'/behavior_'+str(min(self.behavior_policy_range))+\
            '_'+str(max(self.behavior_policy_range))+'/n_eps'+str(self.num_episodes)+'/horizon'+str(dummy_env.max_ep_len)+'/'
        ensure_dir_exists(file=data_path)
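For orientation, the keys this constructor reads from spec_tree can be sketched as a plain mapping. This is purely illustrative: spec_tree is a framework component with its own access semantics, and all values below are made up; only the key names come from the snippet above.

# Hypothetical values; only the key names are taken from the snippet.
example_spec = {
    'enabled': True,
    'behavior_policy_range': {'min': 1, 'max': 5, 'step': 1},  # -> range(1, 6, 1)
    'temperature': 1.0,
    'num_episodes': 200,
    'save_data_to': '../data',
    'dataset_seed': 0,
    'load_model_from': '../models/',  # 'policy_<id>.pt' is appended per behavior policy
}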
Example #2
    def add_output_file(self, file_path):
        # Add one console output file.
        file_path = os.path.abspath(file_path)
        self.file_paths.append(file_path)
        utils.ensure_dir_exists(file=file_path)
        # Create (or truncate) the file so later writes start from an empty log.
        output_file = open(file_path, 'w')
        output_file.close()
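Every example in this listing calls ensure_dir_exists(file=...) before writing to disk, but the helper itself is not shown. A minimal sketch of what it plausibly does, written as a hypothetical ensure_dir_exists_sketch since the project's real implementation may differ:

import os

def ensure_dir_exists_sketch(file):
    # Hypothetical stand-in: create the parent directory of the given file path
    # if it does not exist yet, so a subsequent open(file, 'w') cannot fail on
    # a missing directory. The project's actual helper may accept more arguments.
    parent = os.path.dirname(os.path.abspath(file))
    os.makedirs(parent, exist_ok=True)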
Example #3
    def __init__(self, spec_tree, run_spec_path, hp_handler):
        self.spec_tree = spec_tree
        self.hp_handler = hp_handler

        # Get spec values.
        self.cuda = spec_tree['cuda']
        self.log_to_tensorboard = spec_tree['log_to_tensorboard']
        self.experiment_path = spec_tree['experiment_path']

        # Begin writing two copies of the console output.
        logger.add_output_file('console.txt')
        logger.add_output_file(
            os.path.join(self.experiment_path, 'console.txt'))

        # Check the experiment_path.
        if not self.experiment_path.startswith('../results'):
            logger.write_line(
                "WARNING: experiment_path \'{}\' (found in runspec) does not begin with '../results'. "
                "Job results will not be mirrored to Azure Storage.".format(
                    self.experiment_path))

        # Copy the launched runspec to the results folder.
        dest = pjoin(self.experiment_path, os.path.basename(run_spec_path))
        if run_spec_path != dest:
            copyfile(run_spec_path, dest)

        # Is this app running as part of a launched job?
        in_job = os.getenv("XT_RUN_NAME")
        if in_job:
            # Yes. Don't create another job launcher.
            self.job_launcher = None
        else:
            # No. Try to instantiate a job launcher.
            self.job_launcher = spec_tree.create_component('job_launcher')
            if self.job_launcher and self.job_launcher.hp_tuning:
                self.hp_handler.write_hp_config_file()

        # Write the top portion of the repro spec tree to two files,
        # one in the rl_nexus dir, and the other in the experiment_path dir.
        local_repro_spec_path = 'repro_spec.yaml'
        exper_repro_spec_path = os.path.join(self.experiment_path,
                                             'repro_spec.yaml')
        utils.ensure_dir_exists(file=exper_repro_spec_path)
        self.repro_spec_paths = (local_repro_spec_path, exper_repro_spec_path)
        self.write_to_repro_spec(self.spec_tree, '', 'w')
        self.write_to_repro_spec('\nprocessing_stages:\n', '', 'a')
Example #4
    def __init__(self, spec_tree, device):
        self.spec_tree = spec_tree
        self.device    = device

        # Get spec values
        self.enabled              = spec_tree['enabled']
        self.save_model_to        = resolve_path(spec_tree['save_model_to'])
        self.save_logs_to         = resolve_path(spec_tree['save_logs_to'])
        self.max_iterations       = spec_tree['max_iterations']
        self.iters_per_report     = spec_tree['iters_per_report']
        self.get_action_from_env  = spec_tree['get_action_from_env']
        self.train                = spec_tree['train']
        self.render               = spec_tree['render']
        self.model_load_paths_dict = {
            'load_model_from':      resolve_path(spec_tree['load_model_from']),
            'load_backbone_from':   resolve_path(spec_tree['load_backbone_from']),
            'load_core_from':       resolve_path(spec_tree['load_core_from']),
            'load_embedding_from':  resolve_path(spec_tree['load_embedding_from']),
            'load_head_from':       resolve_path(spec_tree['load_head_from'])
        }
        self.model_save_paths_dict = {
            'save_model_to':        resolve_path(spec_tree['save_model_to']),
            'save_backbone_to':     resolve_path(spec_tree['save_backbone_to']),
            'save_core_to':         None,
            'save_embedding_to':    None,
            'save_head_to':         resolve_path(spec_tree['save_head_to'])
        }

        # Environment component
        self.environment = spec_tree.create_component('environment')
        # import pdb; pdb.set_trace()
        # Agent component
        self.agent = spec_tree.create_component('agent',
                                                self.environment.observation_space,
                                                self.environment.action_space,
                                                device)
        # import pdb; pdb.set_trace()
        # XT related
        self.xt_run_name = os.getenv("XT_RUN_NAME", None)
        self.xt_run = None
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()
            # log hyperparameter values to XT. (in progress)
            #hd = cf.get_hparam_dict()
            #self.xt_run.log_hparams( hd )

        if self.agent.loop_type() != 'ray_loop':
            evaluation_num_episodes = spec_tree['evaluation_num_episodes']
            assert evaluation_num_episodes == 0, 'Only rllib\'s algorithm implementations support intra-stage evaluation.' 
            self.agent.load_model(self.model_load_paths_dict, True)

        if self.save_model_to:
            ensure_dir_exists(file=self.save_model_to)
            logger.write_line("Saving models to {}".format(self.save_model_to))

        # Switch the agent into eval mode if requested
        if not self.train and not spec_tree['disable_eval']:
            if self.agent.model is not None:
                self.agent.model.eval()

        self.metric_data_list = []
Example #5
    def __init__(self, spec_tree, device):
        self.spec_tree = spec_tree

        self.seed = spec_tree['algo_seed']

        self.dataset_seed = spec_tree['dataset_seed']
        self.use_ray = spec_tree['use_ray']

        self.ope = spec_tree['offline_estimators']
        assert isinstance(self.ope,
                          list), 'the estimators should be described as a list'
        # self.metrics = {}
        # for estimator in self.ope:
        #     metric = Metric(short_name=estimator, long_name=estimator,formatting_string='{:5.2f}', higher_is_better=False)
        #     self.metrics[estimator] = metric
        self.horizon = spec_tree['horizon']
        self.num_episodes = spec_tree['num_episodes']
        self.gamma = spec_tree['gamma']
        self.normalization = spec_tree['normalization']
        self.target_temp = spec_tree['target_policy_temperature']

        # reset the environment seed
        spec_tree['environment']['seed'] = self.seed
        assert self.horizon == spec_tree['environment'][
            'max_ep_len'], 'horizon and max_ep_len of environment do not match'
        assert not spec_tree['environment'][
            'fixed_length_episode'], 'should turn fixed length episode to false for on policy evaluation purpose'
        self.environment = spec_tree.create_component('environment')
        obs_space = self.environment.observation_space
        action_space = self.environment.action_space

        self.obs_dim = obs_space.shape[0]
        self.act_dim = action_space.n

        #* prepare the result path
        self.behavior_policy_type = spec_tree['behavior_policy_type']
        if self.behavior_policy_type == 'random':
            behavior_type_string = '/behavior_uniform_random'
        elif self.behavior_policy_type == 'random_network':
            behavior_type_string = '/behavior_random_network'
        elif self.behavior_policy_type == 'epsilon_greedy':
            self.behavior_min_id = spec_tree['behavior_policy_range']['min']
            self.behavior_max_id = spec_tree['behavior_policy_range']['max']
            behavior_type_string = '/behavior_epsilon_greedy_'+str(self.behavior_min_id)+\
                '_'+str(self.behavior_max_id)
        else:
            self.behavior_min_id = spec_tree['behavior_policy_range']['min']
            self.behavior_max_id = spec_tree['behavior_policy_range']['max']
            behavior_type_string = '/behavior_'+str(self.behavior_min_id)+\
                '_'+str(self.behavior_max_id)

        self.debug_mode = spec_tree['debug_mode']
        self.result_path_prefix = spec_tree['save_results_to'] + behavior_type_string + '/target_{}_temp{}/n_eps{}/horizon{}/seed'.format(\
            spec_tree['target_policy_id'], self.target_temp, self.num_episodes, self.horizon)

        ensure_dir_exists(file=self.result_path_prefix)

        #* prepare the data path (if reading from an external data set)
        self.read_data_from_file = spec_tree['read_data_from_file']
        self.dataset_path_prefix = None

        if self.read_data_from_file:
            self.dataset_path_prefix = spec_tree['load_data_from'] + behavior_type_string +'/n_eps{}/horizon{}/seed'.format(\
                self.num_episodes, self.horizon)

        target_policy_net_path = spec_tree['load_model_from']+ '/policy_' +\
             str(spec_tree['target_policy_id'])+'.pt'
        # self.tf_policy_net = convert_policy_network(spec_tree,obs_space,action_space, temperature = self.target_temp, path = target_policy_net_path)
        self.target_policy_net = spec_tree.create_component(
            'model', obs_space, action_space)
        self.target_policy_net.temperature = self.target_temp
        self.target_policy_net.load_model(target_policy_net_path)
        # self.target_model_weights_list = extract_model_weights(self.target_policy_net.model, self.obs_dim, self.act_dim)
        self.target_model_weights_list = convert_torch_model_weights_to_list(
            self.target_policy_net.model)

        on_policy_num_eps = spec_tree['on_policy_eval_num_episodes']
        self.value_true = evaluate_on_policy(self.environment,
                                             self.target_policy_net,
                                             num_episodes=on_policy_num_eps,
                                             gamma=self.gamma)

        self.hidden_layers_net = self.spec_tree['model']['fcnet_hiddens']
        self.activation_net = self.spec_tree['model']['fcnet_activation']

        self.data = None
Example #6
    def __init__(self, spec_tree, device):
        self.spec_tree = spec_tree
        self.device = device

        # get spec values
        self.enabled = spec_tree['enabled']
        self.behavior_policy_type = spec_tree['behavior_policy_type']
        if self.behavior_policy_type == 'random':
            self.behavior_policy_range = None
        elif self.behavior_policy_type == 'random_network':
            self.behavior_policy_range = None
        elif self.behavior_policy_type == 'range' or self.behavior_policy_type == 'range_tabular':
            behavior_policy_range = spec_tree['behavior_policy_range']
            self.behavior_policy_range = range(
                behavior_policy_range['min'], behavior_policy_range['max'] + 1,
                behavior_policy_range['step'])
        elif self.behavior_policy_type == 'epsilon_greedy':
            behavior_policy_range = spec_tree['behavior_policy_range']
            self.behavior_policy_range = range(
                behavior_policy_range['min'], behavior_policy_range['max'] + 1,
                behavior_policy_range['step'])
        elif self.behavior_policy_type == 'single':
            behavior_policy_range = spec_tree['behavior_policy_range']
            assert behavior_policy_range['min'] == behavior_policy_range['max']
            assert behavior_policy_range['step'] == 1
            self.behavior_policy_range = [behavior_policy_range['min']]
        self.temperature = spec_tree['temperature']
        self.num_episodes = spec_tree['num_episodes']
        self.save_data_to = resolve_path(spec_tree['save_data_to'])
        self.dataset_seed = spec_tree['dataset_seed']
        # self.dataset_seed = range(dataset_seed['start'], dataset_seed['stop'], dataset_seed['step'])

        self.behavior_policy_collection = []
        dummy_env = self.spec_tree.create_component('environment')
        self.obs_space = dummy_env.observation_space
        self.action_space = dummy_env.action_space
        if self.behavior_policy_type == 'range_tabular':
            for behavior_policy_id in self.behavior_policy_range:
                policy_path = self.spec_tree['load_model_from'] + 'pi' + str(
                    behavior_policy_id) + '.npy'
                q_table_path = self.spec_tree['load_model_from'] + 'q' + str(
                    behavior_policy_id) + '.npy'
                behavior_policy = np.load(policy_path)
                q_table = np.load(q_table_path)
                self.behavior_policy_collection.append(behavior_policy)
        else:
            if self.behavior_policy_range:
                assert self.behavior_policy_type != 'random'
                assert self.behavior_policy_type != 'random_network'
                for behavior_policy_id in self.behavior_policy_range:
                    behavior_policy = self.spec_tree.create_component(
                        'model', self.obs_space, self.action_space)
                    model_path = self.spec_tree[
                        'load_model_from'] + 'policy_' + str(
                            behavior_policy_id) + '.pt'
                    behavior_policy.load_model(model_path)
                    self.behavior_policy_collection.append(behavior_policy)
            elif self.behavior_policy_type == 'random_network':
                behavior_policy = self.spec_tree.create_component(
                    'model', self.obs_space, self.action_space)
                self.behavior_policy_collection.append(behavior_policy)

        data_path = None
        self.save_data = spec_tree['save_data']
        if self.save_data:
            if self.behavior_policy_type == 'random':
                behavior_type_string = '/behavior_uniform_random'
            elif self.behavior_policy_type == 'random_network':
                behavior_type_string = '/behavior_random_network'
            elif self.behavior_policy_type == 'epsilon_greedy':
                behavior_type_string = '/behavior_epsilon_greedy_'+str(min(self.behavior_policy_range))+\
                    '_'+str(max(self.behavior_policy_range))
            else:
                behavior_type_string = '/behavior_'+str(min(self.behavior_policy_range))+\
                    '_'+str(max(self.behavior_policy_range))
            self.data_path = spec_tree['save_data_to']+ behavior_type_string +\
                '/n_eps'+str(self.num_episodes)+'/horizon'+str(dummy_env.max_ep_len)+'/'
            ensure_dir_exists(file=self.data_path)
Example #7
    def save_model_in_progress(self, save_paths_dict, policy_tag):
        save_path = save_paths_dict['save_model_to']+'_'+str(policy_tag)+".pt"
        ensure_dir_exists(file=save_path)
        torch.save(self.policy_network.state_dict(), save_path)
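save_model_in_progress stores only the state_dict, so loading it back later requires constructing a network with the same architecture first. A generic PyTorch round-trip sketch; TinyPolicyNet and the file name are placeholders, not classes or paths from the source.

import torch
import torch.nn as nn

# Hypothetical stand-in for the policy network; the real architecture lives
# elsewhere in the project and must match the saved state_dict exactly.
class TinyPolicyNet(nn.Module):
    def __init__(self, obs_dim=4, act_dim=2):
        super().__init__()
        self.fc = nn.Linear(obs_dim, act_dim)

    def forward(self, x):
        return self.fc(x)

net = TinyPolicyNet()
torch.save(net.state_dict(), 'policy_demo.pt')   # mirrors save_model_in_progress

restored = TinyPolicyNet()                       # same architecture required
restored.load_state_dict(torch.load('policy_demo.pt', map_location='cpu'))
restored.eval()                                  # inference mode when not training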
Example #8
    def __init__(self, spec_tree, device):
        self.spec_tree = spec_tree
        
        # self.seed = eval('random.'+spec_tree['seed'])
        # self.seed = random.randint(spec_tree['algo_seed'])
        self.seed = spec_tree['algo_seed']

        self.dataset_seed = spec_tree['dataset_seed']
        self.use_ray = spec_tree['use_ray']

        self.ope = spec_tree['offline_estimators']
        assert isinstance(self.ope, list), 'the estimators should be described as a list'
        # self.metrics = {}
        # for estimator in self.ope:
        #     metric = Metric(short_name=estimator, long_name=estimator,formatting_string='{:5.2f}', higher_is_better=False)
        #     self.metrics[estimator] = metric
        self.horizon = spec_tree['horizon']
        self.num_episodes = spec_tree['num_episodes']
        self.gamma = spec_tree['gamma']
        self.normalization = spec_tree['normalization']
        self.target_temp = spec_tree['target_policy_temperature']

        # reset the environment seed
        spec_tree['environment']['seed'] = self.seed
        assert self.horizon == spec_tree['environment']['max_ep_len'], 'horizon and max_ep_len of environment do not match'
        assert not spec_tree['environment']['fixed_length_episode'], 'should turn fixed length episode to false for on policy evaluation purpose'
        self.environment = spec_tree.create_component('environment')
        obs_space = self.environment.observation_space
        action_space = self.environment.action_space
        
        self.obs_dim = obs_space.shape[0]
        self.act_dim = action_space.n

        # locate off-line dataset
        # dataset_path = spec_tree['load_data_from'] + '/' + self.environment.name+'/behavior_'+str(behavior_policy_min_id)+'_'+\
        #     str(behavior_policy_max_id)+'/n_eps'+str(self.num_episodes)+'/horizon'+str(self.horizon)+'/seed'+str(self.dataset_seed)+'.h5'

        # dataset_path = spec_tree['load_data_from'] + '/behavior_' + str(self.behavior_min_id) + '_' +\
        #     str(self.behavior_max_id)+'/n_eps'+str(self.num_episodes)+'/horizon'+str(self.horizon)+'/seed'+str(self.dataset_seed)+'.h5'

        target_policy_net_path = spec_tree['load_model_from']+ '/policy_' +\
             str(spec_tree['target_policy_id'])+'.pt'       

        #* prepare the data and result path
        self.behavior_min_id = spec_tree['behavior_policy_range']['min']
        self.behavior_max_id = spec_tree['behavior_policy_range']['max']

        # self.dataset_path_prefix = spec_tree['load_data_from'] + '/behavior_' + str(self.behavior_min_id) + '_' +\
        #     str(self.behavior_max_id)+'/n_eps'+str(self.num_episodes)+'/horizon'+str(self.horizon)+'/seed'
        
        self.dataset_path_prefix = spec_tree['load_data_from'] + '/behavior_{}_{}/n_eps{}/horizon{}/seed'.format(self.behavior_min_id,\
            self.behavior_max_id, self.num_episodes, self.horizon)

        # self.result_path_prefix = spec_tree['save_results_to'] + '/behavior_' + str(self.behavior_min_id) + '_' +\
        #     str(self.behavior_max_id)+'/target_'+str(spec_tree['target_policy_id'])+'_temp'+str(self.target_temp)+'/n_eps'+str(self.num_episodes)+'/horizon'+str(self.horizon)+'/seed'
        self.result_path_prefix = spec_tree['save_results_to'] + '/behavior_{}_{}/target_{}_temp{}/n_eps{}/horizon{}/seed'.format(\
            self.behavior_min_id, self.behavior_max_id, spec_tree['target_policy_id'], self.target_temp, self.num_episodes, self.horizon)
        
        ensure_dir_exists(file=self.result_path_prefix)

        # sess = make_session()
        # sess.__enter__()

        # self.tf_policy_net = convert_policy_network(spec_tree,obs_space,action_space, temperature = self.target_temp, path = target_policy_net_path)
        self.target_policy_net = spec_tree.create_component('model', obs_space, action_space)
        self.target_policy_net.temperature = self.target_temp
        self.target_policy_net.load_model(target_policy_net_path)
        # self.target_model_weights_list = extract_model_weights(self.target_policy_net.model, self.obs_dim, self.act_dim)
        self.target_model_weights_list = convert_torch_model_weights_to_list(self.target_policy_net.model)
        # diff = 0
        # pdb.set_trace()
        # for i in range(len(weights_list)):
        #     if len(weights_list[i])>0:
        #         diff += ((weights_list[i][0] - self.target_model_weights_list[i][0])**2).sum()
        #         diff += ((weights_list[i][1] - self.target_model_weights_list[i][1])**2).sum()
        # pdb.set_trace()
        # data = read_batch_experience(dataset_path, self.target_policy_net, self.num_episodes, self.target_temp, self.horizon, self.gamma)
        
        on_policy_num_eps = spec_tree['on_policy_eval_num_episodes']
        self.value_true = evaluate_on_policy(self.environment, self.target_policy_net, num_episodes = on_policy_num_eps, gamma = self.gamma)
        
        # if self.normalization == 'std_norm':
        #     #* whiten data
        #     obs_mean = np.mean(data['obs'], axis=0, keepdims = True)
        #     obs_std = np.std(data['obs'], axis = 0, keepdims = True)
        #     data['obs'] = (data['obs'] - obs_mean) / obs_std
        #     data['next_obs'] = (data['next_obs'] - obs_mean) / obs_std
        #     data['init_obs'] = (data['init_obs'] - obs_mean) / obs_std
        #     data['term_obs'] = (data['term_obs'] - obs_mean) / obs_std

        #     self.norm_performed = {'type': self.normalization, 'shift': obs_mean, 'scale': obs_std}
        # else:
        #     self.norm_performed = {'type': None, 'shift': None, 'scale': None}
        # # data['init_obs'] = data['obs'][::self.horizon]
        # # data['init_acts'] = data['acts'][::self.horizon]
        # # data['term_obs'] = data['next_obs'][self.horizon-1::self.horizon]
        # data['target_prob_init_obs'] = data['target_prob_obs'][::self.horizon]
        # data['target_prob_term_obs'] = data['target_prob_next_obs'][self.horizon-1::self.horizon]

        # data['next_acts'] = data['acts'].copy()
        # for i in range(self.num_episodes):
        #     data['next_acts'][i*self.horizon:(i+1)*self.horizon-1] = data['acts'][i*self.horizon+1:(i+1)*self.horizon].copy()
        # self.data = data
        # import pdb; pdb.set_trace()
        self.hidden_layers_net = self.spec_tree['model']['fcnet_hiddens']
        self.activation_net = self.spec_tree['model']['fcnet_activation']
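evaluate_on_policy is called above but not shown. A plausible minimal version is sketched below; it assumes a Gym-style reset()/step() environment and a policy object exposing a sample_action(obs) method, both of which are assumptions rather than the project's actual interfaces.

def evaluate_on_policy_sketch(environment, policy, num_episodes=10, gamma=0.99):
    # Monte Carlo estimate of the discounted return of `policy` in `environment`.
    # Assumes the classic Gym API: reset() -> obs, step(a) -> (obs, reward, done, info).
    returns = []
    for _ in range(num_episodes):
        obs = environment.reset()
        done, discount, ep_return = False, 1.0, 0.0
        while not done:
            action = policy.sample_action(obs)  # hypothetical policy interface
            obs, reward, done, _ = environment.step(action)
            ep_return += discount * reward
            discount *= gamma
        returns.append(ep_return)
    return sum(returns) / len(returns)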