Example 1
def run_experiment(variant, reporter):
    training_environment = (get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        }))
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(environment=training_environment, max_size=1e6)

    sampler = SimpleSampler(max_path_length=30)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape,
        )
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes':
        training_environment.observation_shape,
        'output_shape':
        training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    plotter = QFPolicyPlotter(Q=Qs[0],
                              policy=policy,
                              obs_lst=np.array(((-2.5, 0.0), (0.0, 0.0),
                                                (2.5, 2.5), (-2.5, -2.5))),
                              default_action=(np.nan, np.nan),
                              n_samples=100)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': pool,
        'sampler': sampler,
        'min_pool_size': 100,
        'batch_size': 64,
        'plotter': plotter,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    for train_result in algorithm.train():
        reporter(**train_result)
Example 2
    def _build(self):
        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_roboverse_env_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_roboverse_env_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params
            else training_environment)

        from collections import OrderedDict

        # Restrict the Q-function inputs to the image observation only,
        # rather than the environment's full observation_shape.
        changed_obs_shape = OrderedDict()
        changed_obs_shape['image'] = training_environment.observation_shape['image']
        variant['Q_params']['config'].update({
            'input_shapes': (
                changed_obs_shape,
                training_environment.action_shape),
        })
        Qs = self.Qs = value_functions.get(variant['Q_params'])

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes': changed_obs_shape,  # image-only observation shape
            'output_shape': training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment': training_environment,
        })
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        sampler = self.sampler = samplers.get(variant['sampler_params'])

        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
Example 3
    def _build(self):
        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        variant['Q_params']['config'].update({
            'input_shapes': (training_environment.observation_shape,
                             training_environment.action_shape),
        })
        Qs = self.Qs = value_functions.get(variant['Q_params'])

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes':
            training_environment.observation_shape,
            'output_shape':
            training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment':
            training_environment,
        })
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        sampler = self.sampler = samplers.get(variant['sampler_params'])
        self.sampler.seed = variant['run_params']['seed']
        # Debug check: sampler and self.sampler refer to the same object, so
        # both values printed here are the seed that was just set.
        print(sampler.seed, self.sampler.seed)

        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
Example 4
def load_policy(checkpoint_dir, variant, environment):
    policy_params = variant['policy_params'].copy()
    policy_params['config'] = {
        **policy_params['config'],
        'action_range':
        (environment.action_space.low, environment.action_space.high),
        'input_shapes':
        environment.observation_shape,
        'output_shape':
        environment.action_shape,
    }

    policy = policies.get(policy_params)

    policy_save_path = ExperimentRunner._policy_save_path(checkpoint_dir)
    status = policy.load_weights(policy_save_path)
    status.assert_consumed().run_restore_ops()

    return policy
Example 5
    def _build(self):
        '''
        variant['something params'] holds the construction parameters for
        "something". It in turn contains two entries:
        variant['something params']['class_name'] and
        variant['something params']['config'].

        Those two entries are enough to create an object instance.
        (A minimal sketch of this pattern follows the example.)
        '''

        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        variant['Q_params']['config'].update({
            'input_shapes': (training_environment.observation_shape,
                             training_environment.action_shape),
        })
        # Build a value-function instance (wrapping a neural network) from the config.
        Qs = self.Qs = tree.flatten(value_functions.get(variant['Q_params']))

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes':
            training_environment.observation_shape,
            'output_shape':
            training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment':
            training_environment,
        })
        # As with value_functions.get, build the instance from its config.
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        # Build the lower-level objects from their configs in variant, then
        # write them into the higher-level config.
        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        # Build the higher-level object from that config.
        sampler = self.sampler = samplers.get(variant['sampler_params'])

        # Same pattern: build the lower-level objects, then fill them into the
        # higher-level (algorithm) config.
        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        # Build the top-level object from that config: the RL algorithm, which
        # ties together all of the components above.
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
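
The docstring above describes the convention every _build method relies on: each variant['<something>_params'] entry pairs a 'class_name' with a 'config', and a module-level get() helper turns that pair into an instance. A minimal, self-contained sketch of the pattern follows; the registry, the placeholder class, and the example shapes are illustrative assumptions, not the real value_functions module.

# Sketch of the {class_name, config} -> instance factory used throughout
# these snippets. Names below are hypothetical stand-ins.
class PlaceholderQFunction:
    def __init__(self, input_shapes=None, hidden_layer_sizes=(256, 256)):
        self.input_shapes = input_shapes
        self.hidden_layer_sizes = hidden_layer_sizes

REGISTRY = {'placeholder_Q_function': PlaceholderQFunction}

def get(params):
    # Resolve the class by name and instantiate it with the config dict.
    cls = REGISTRY[params['class_name']]
    return cls(**params['config'])

# Usage, mirroring _build: fill in the config first, then call get().
Q_params = {
    'class_name': 'placeholder_Q_function',
    'config': {'hidden_layer_sizes': (256, 256)},
}
Q_params['config'].update({'input_shapes': ((17,), (6,))})  # e.g. (obs, action) shapes
Q = get(Q_params)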
Example 6
    def _build(self):
        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params
            else training_environment)

        variant['Q_params']['config'].update({
            'input_shapes': (
                training_environment.observation_shape,
                training_environment.action_shape),
        })
        Qs = self.Qs = value_functions.get(variant['Q_params'])

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes': training_environment.observation_shape,
            'output_shape': training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment': training_environment,
        })
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        sampler = self.sampler = samplers.get(variant['sampler_params'])

        set_random_seed(variant['run_params']['seed'])
        save_path = os.path.join(
            os.path.dirname(__file__), "..", "..", "results", "logs", "sac",
            f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
        print("this is the save path: " + save_path)
        os.makedirs(save_path, exist_ok=True)

        # create wrapped environment
        eval_env_wrapped = TimeLimit(evaluation_environment, 1000)

        eval_callback = EvalCallback(
            eval_env_wrapped,
            callback_on_new_best=None,
            best_model_save_path=None,
            n_eval_episodes=10,
            log_path=save_path,
            eval_freq=10000,  # TODO change hardcoded value
            deterministic=True,
            verbose=1,
        )
        eval_callback.init_callback(policy)
        sampler.set_callback(eval_callback)
        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
Example 7
def main(variant_in):
    variant = copy.deepcopy(variant_in)

    environment_params = variant['environment_params']
    training_environment = get_environment_from_params(environment_params['training'])
    evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params else training_environment
    )

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = replay_pools.get(variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler
    })
    algorithm = algorithms.get(variant['algorithm_params'])
    print("Initialization finished")

    train_generator = None
    # algorithm.train() is a generator: it iterates over 'n_epochs' epochs.
    # During each epoch it adds 'epoch_length' samples to the replay pool
    # (environment resets are not counted) and performs a training update on
    # every step once the pool holds more than 'min_pool_size' samples.
    # (A sketch of this generator protocol follows the example.)
    for i in count():
        if train_generator is None:
            train_generator = algorithm.train()
        diagnostics = next(train_generator)

        # Check for termination before printing, to avoid printing the last
        # epoch twice.
        try:
            if diagnostics['done']:
                break
        except KeyError:
            pass

        eval_reward = diagnostics["evaluation"]["episode-reward-mean"]
        print(f"Evaluation: reward mean is {eval_reward}")
        # train_reward = diagnostics["training"]["episode-reward-mean"]
        # print(f"Training: reward mean is {train_reward}")

    print("Finish")
    return policy
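
The loop above treats algorithm.train() as a generator that yields one diagnostics dict per epoch and flags the final epoch with a 'done' key. A minimal sketch of that protocol is shown below; the dict keys and the epoch structure are assumptions read off the snippet, not the actual algorithm implementation.

def train(n_epochs=3, epoch_length=1000):
    # Yield one diagnostics dict per epoch; the last one carries 'done': True.
    for epoch in range(n_epochs):
        # ... sample 'epoch_length' steps and run gradient updates here ...
        yield {
            'epoch': epoch,
            'evaluation': {'episode-reward-mean': 0.0},  # placeholder value
            'done': epoch == n_epochs - 1,
        }

# Consumed the same way as in main():
for diagnostics in train():
    if diagnostics.get('done'):
        break
    print(diagnostics['evaluation']['episode-reward-mean'])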