Example No. 1
    def _build(self):
        variant = copy.deepcopy(self._variant)
        print(variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(variant,
                                                       training_environment,
                                                       Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example No. 2
    def _build(self):
        """
        called by tune to build algorithm 
        """
        variant = copy.deepcopy(self._variant)

        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        mjc_model_environment = self.mjc_model_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(variant,
                                                       training_environment,
                                                       Qs, self._session)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        #### get termination function
        domain = environment_params['training']['domain']
        static_fns = mbpo.static[domain.lower()]
        ####

        #### build algorithm
        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            mjc_model_environment=mjc_model_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            static_fns=static_fns,
            sampler=sampler,
            session=self._session)

        initialize_tf_variables(self._session, only_uninitialized=True)

        # add graph since ray doesn't seem to automatically add that
        graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph)
        graph_writer.flush()
        graph_writer.close()

        #### finalize graph
        # tf.get_default_graph().finalize() ### good for debugging, but interferes with Qs on SAC
        self._built = True
Example No. 3
    def _build(self):
        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        variant['Q_params']['config'].update({
            'input_shapes': (training_environment.observation_shape,
                             training_environment.action_shape),
        })
        Qs = self.Qs = value_functions.get(variant['Q_params'])

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes':
            training_environment.observation_shape,
            'output_shape':
            training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment':
            training_environment,
        })
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        sampler = self.sampler = samplers.get(variant['sampler_params'])
        self.sampler.seed = variant['run_params']['seed']
        print(sampler.seed, self.sampler.seed)

        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
Example No. 4
def load_policy(path):
    with open(path, "rb") as f:
        checkpoint = pickle.load(f)

    variant = checkpoint["variant"]
    env_params = variant["environment_params"]["training"]
    alice_params = variant["alice"]
    bob_params = variant["bob"]
    num_skills = alice_params["algorithm_params"]["discriminator_params"][
        "num_skills"]

    # bob policy
    env = get_environment_from_params(env_params)
    bob_policy = get_policy_from_variant(bob_params, env)
    bob_policy.set_weights(checkpoint["policy_weights"]["bob"])
    bob_policy._deterministic = True

    # alice policy
    env._observation_space.spaces["diayn"] = gym.spaces.Box(
        low=np.repeat(0, num_skills),
        high=np.repeat(1, num_skills),
    )
    env.observation_keys += ("diayn", )

    alice_policy = get_policy_from_variant(alice_params, env)
    alice_policy.set_weights(checkpoint["policy_weights"]["alice"])
    alice_policy._deterministic = True

    return env, alice_policy, bob_policy, num_skills
Example No. 5
def init_policy():
    session = tf.keras.backend.get_session()
    checkpoint_path = CHECKPOINT_PATH.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    environment_params['n_parallel_envs'] = 1
    evaluation_environment = get_environment_from_params(environment_params)
    policy = get_policy_from_variant(variant, evaluation_environment)
    policy.set_weights(picklable['policy_weights'])

    Qs = get_Q_function_from_variant(variant, evaluation_environment)
    for i, Q in enumerate(Qs):
        Qs[i].load_weights(os.path.join(checkpoint_path, 'Qs_{}'.format(i)))

    return policy, Qs
Example No. 6
def load_environment(variant):
    environment_params = (variant['environment_params']['training']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])

    environment = get_environment_from_params(environment_params)
    return environment
Example No. 7
def get_ddl_goal_state_from_variant(variant):
    train_env_params = variant['environment_params']['training']
    env = get_environment_from_params(train_env_params)

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    if task in ['Valve3PickupFixed-v0']:
        try:
            env_path = os.path.join(
                goal_directory,
                GOAL_PATH_PER_UNIVERSE_DOMAIN_TASK[universe][domain][task])
            pkl_path = os.path.join(env_path, 'positives.pkl')
            with open(pkl_path, 'rb') as f:
                goal_state = pickle.load(f)
        except KeyError:
            raise NotImplementedError
    else:
        domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][
            domain]
        gen_func = domain_generators.get(task,
                                         domain_generators[DEFAULT_TASK_KEY])

        goal_state = gen_func(env,
                              include_transitions=False,
                              num_total_examples=1,
                              goal_threshold=0.0)
    goal_state = {key: val[0] for key, val in goal_state.items()}
    return goal_state
Example No. 8
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    evaluation_environment.seed(variant['run_params']['seed'])

    if args.record_video:
        video_dir = os.path.join(experiment_path, 'test-video')
        evaluation_environment._env = wrappers.Monitor(
            evaluation_environment._env, video_dir, force=True)

    policy = (get_policy_from_variant(variant, evaluation_environment))
    policy.set_weights(picklable['policy_weights'])

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs)

        if not args.record_video:
            evaluation_metrics = evaluate_rollouts(paths,
                                                   evaluation_environment)
            evaluation_file_path = os.path.join(experiment_path,
                                                'final_eval.csv')
            with open(evaluation_file_path, 'w') as f:
                w = csv.DictWriter(f, evaluation_metrics.keys())
                w.writeheader()
                w.writerow(evaluation_metrics)

    if args.render_kwargs.get('mode') == 'rgb_array':
        fps = 1 // getattr(evaluation_environment, 'dt', 1 / 30)
        for i, path in enumerate(paths):
            video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.mp4')
            save_video(path['images'], video_save_path, fps=fps)

    return paths
Example No. 9
    def build(self):
        environment_params = self.variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self.variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(self.variant)
        Qs = self.Qs = get_Q_function_from_variant(self.variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(self.variant,
                                                       training_environment,
                                                       Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        #### get termination function
        domain = environment_params['training']['domain']
        static_fns = static[domain.lower()]
        ####

        log_path = './log/%s' % (self.variant['algorithm_params']['domain'])
        if (not os.path.exists(log_path)):
            os.makedirs(log_path)

        self.algorithm = get_algorithm_from_variant(
            variant=self.variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            static_fns=static_fns,
            sampler=sampler,
            session=self._session,
            log_file='./log/%s/%d.log' %
            (self.variant['algorithm_params']['domain'], time.time()))

        initialize_tf_variables(self._session, only_uninitialized=True)
Example No. 10
    def _build(self):
        variant = copy.deepcopy(self._variant)

        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params
            else training_environment)
        
        seed = variant['run_params']['seed']
        
        training_environment.seed(seed)
        
        # Set a different seed for the evaluation env
        # to ensure the policy is not just memorizing action sequences for seen initial states
        evaluation_environment.seed(seed + 10)

        replay_pool = self.replay_pool = (
            get_replay_pool_from_variant(variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(
            variant, training_environment)
        policy = self.policy = get_policy_from_variant(
            variant, training_environment, Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example No. 11
def load_policy_and_environment(picklable, variant):
    environment_params = (variant['environment_params']['training']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])

    environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, environment)
    policy.set_weights(picklable['policy_weights'])

    return policy, environment
Example No. 12
def load_policy(fpath, itr='last', deterministic=False):

    # handle which epoch to load from
    if itr=='last':
        saves = [int(x[11:]) for x in os.listdir(fpath) if 'simple_save' in x and len(x)>11]
        itr = '%d'%max(saves) if len(saves) > 0 else ''
    else:
        itr = '%d'%itr

    # load the things!
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(sess)
    sess = tf.keras.backend.get_session()
    
    #sess = tf.Session(graph=tf.Graph())
    
    saver = Saver()
    model = saver.restore_tf_graph(sess, fpath)

    # get the correct op for executing actions
    if deterministic and 'mu' in model.keys():
        # 'deterministic' is only a valid option for SAC policies
        print('Using deterministic action op.')
        action_op = model['mu']
    else:
        print('Using default action op.')
        action_op = model['pi']

    # make function for producing an action given a single state
    get_action = lambda x : sess.run(action_op, feed_dict={model['x']: x})

    # try to load environment from save
    # (sometimes this will fail because the environment could not be pickled)
    try:
        state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl'))
        env = state['env']
    except Exception:
        environment_params = {}
        environment_params['universe'] = 'gym'
        environment_params['task'] = 'v2'
        environment_params['domain'] = 'HumanoidSafe'
        environment_params['kwargs'] = {}
        env = get_environment_from_params(environment_params)
        # env = wrappers.Monitor(env, '/home/uvday/ray_mbpo/AntSafe/', force = True)

    return env, get_action, sess
Example No. 13
def get_ddl_goal_state_from_variant(variant):
    train_env_params = variant['environment_params']['training']
    env = get_environment_from_params(train_env_params)

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][domain]
    gen_func = domain_generators.get(task, domain_generators[DEFAULT_TASK_KEY])

    goal_state = gen_func(env,
                          include_transitions=False,
                          num_total_examples=1,
                          goal_threshold=0.0)
    goal_state = {key: val[0] for key, val in goal_state.items()}
    return goal_state
Example No. 14
def get_policy(checkpoint_path):
    checkpoint_path = checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (get_policy_from_variant(variant, evaluation_environment, Qs=[None]))
    training_environment = get_environment_from_params_custom(environment_params)

    return policy, training_environment
Example No. 15
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (get_policy_from_variant(variant,
                                      evaluation_environment,
                                      Qs=[None]))
    policy.set_weights(picklable['policy_weights'])

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_mode=args.render_mode)

    #### print rewards
    rewards = [path['rewards'].sum() for path in paths]
    print('Rewards: {}'.format(rewards))
    print('Mean: {}'.format(np.mean(rewards)))
    ####

    if args.render_mode != 'human':
        from pprint import pprint
        import pdb
        pdb.set_trace()
        pass

    return paths
Example No. 16
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (
        get_policy_from_variant(variant, evaluation_environment))
    policy.set_weights(picklable['policy_weights'])

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs)

    if args.render_kwargs.get('mode') == 'rgb_array':
        for i, path in enumerate(paths):
            video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.avi')
            save_video(path['images'], video_save_path)

    return paths
Example No. 17
    def _build(self):
        variant = copy.deepcopy(self._variant)
        print(variant.keys())
        env = self.env = get_environment_from_params(
            variant['environment_params']['training'])
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, env))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant, env)
        policy = self.policy = get_policy_from_variant(variant, env, Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', env))

        algorithm_kwargs = {
            'variant': self._variant,
            'env': self.env,
            'policy': policy,
            'initial_exploration_policy': initial_exploration_policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler,
            'session': self._session,
        }

        if self._variant['algorithm_params']['type'] in CLASSIFIER_RL_ALGS:
            reward_classifier = self.reward_classifier \
                = get_reward_classifier_from_variant(self._variant, env)
            algorithm_kwargs['classifier'] = reward_classifier

            goal_examples_train, goal_examples_validation = \
                get_goal_example_from_variant(variant)
            algorithm_kwargs['goal_examples'] = goal_examples_train
            algorithm_kwargs['goal_examples_validation'] = \
                goal_examples_validation

        self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example No. 18
def get_goal_transitions_from_variant(variant):
    """
    Returns SQIL goal transitions (s, a, s', r = 1)
    """
    train_env_params = variant['environment_params']['training']

    env = get_environment_from_params(train_env_params)

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    try:
        # TODO: Add goal generation kwargs (goal threshold, etc.)
        domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][
            domain]
        gen_func = domain_generators.get(task,
                                         domain_generators[DEFAULT_TASK_KEY])
        goal_transitions = gen_func(env, include_transitions=True)
    except KeyError:
        raise NotImplementedError

    return goal_transitions
Example No. 19
def main():
    import sys
    example_args = get_parser().parse_args(sys.argv[1:])

    variant_spec = get_variant_spec(example_args)
    command_line_args = example_args
    print('variant spec: {}'.format(variant_spec))
    params = variant_spec.get('algorithm_params')
    local_dir = os.path.join(params.get('log_dir'), params.get('domain'))

    resources_per_trial = _normalize_trial_resources(
        command_line_args.resources_per_trial, command_line_args.trial_cpus,
        command_line_args.trial_gpus, command_line_args.trial_extra_cpus,
        command_line_args.trial_extra_gpus)
    experiment_id = params.get('exp_name')

    #### add pool_load_max_size to experiment_id
    if 'pool_load_max_size' in variant_spec['algorithm_params']['kwargs']:
        max_size = variant_spec['algorithm_params']['kwargs'][
            'pool_load_max_size']
        experiment_id = '{}_{}e3'.format(experiment_id, int(max_size / 1000))
    ####

    variant_spec = add_command_line_args_to_variant_spec(
        variant_spec, command_line_args)

    if command_line_args.video_save_frequency is not None:
        assert 'algorithm_params' in variant_spec
        variant_spec['algorithm_params']['kwargs']['video_save_frequency'] = (
            command_line_args.video_save_frequency)

    variant = variant_spec
    # init
    set_seed(variant['run_params']['seed'])
    gpu_options = tf.GPUOptions(allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(session)

    # build
    variant = copy.deepcopy(variant)

    tester.set_hyper_param(**variant)
    tester.add_record_param(['run_params.seed', 'info'])
    tester.configure(task_name='policy_learn',
                     private_config_path=os.path.join(get_package_path(),
                                                      'rla_config.yaml'),
                     run_file='main.py',
                     log_root=get_package_path())
    tester.log_files_gen()
    tester.print_args()

    environment_params = variant['environment_params']
    training_environment = (get_environment_from_params(
        environment_params['training']))
    evaluation_environment = (get_environment_from_params(
        environment_params['evaluation'](variant)) if 'evaluation'
                              in environment_params else training_environment)

    replay_pool = (get_replay_pool_from_variant(variant, training_environment))
    sampler = get_sampler_from_variant(variant)
    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment, Qs)
    initial_exploration_policy = (get_policy('UniformPolicy',
                                             training_environment))

    #### get termination function
    domain = environment_params['training']['domain']
    static_fns = mopo.static[domain.lower()]
    ####
    print("[ DEBUG ] KWARGS: {}".format(variant['algorithm_params']['kwargs']))

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=session)
    print('[ DEBUG ] finish construct model, start training')
    # train
    list(algorithm.train())
Example No. 20
    def _build(self):
        '''
        variant['something params'] holds the construction parameters for
        "something"; it in turn contains
        variant['something params']['class_name'] and
        variant['something params']['config'].

        Together these two entries are enough to instantiate the object
        (a minimal sketch of this pattern follows the example).
        '''

        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        variant['Q_params']['config'].update({
            'input_shapes': (training_environment.observation_shape,
                             training_environment.action_shape),
        })
        # Instantiate a value function (which contains a neural network) from the config
        Qs = self.Qs = tree.flatten(value_functions.get(variant['Q_params']))

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes':
            training_environment.observation_shape,
            'output_shape':
            training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment':
            training_environment,
        })
        # As with value_functions.get above, instantiate from the config
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        # Build the lower-level objects from the variant, then inject them into the higher-level config
        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        # Build the higher-level object from that config
        sampler = self.sampler = samplers.get(variant['sampler_params'])

        # Same pattern: inject the lower-level objects into the higher-level config
        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        # Build the top-level object: the RL algorithm, which ties together all of the components above
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True
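The docstring in the example above describes the class_name/config convention that the variant follows. Below is a minimal, hypothetical sketch of that factory pattern; the registry, the register decorator, and the SimpleReplayPool class are illustrative assumptions, not the library's actual implementation:

# Minimal sketch of a {'class_name', 'config'} factory; all names here are
# illustrative assumptions rather than softlearning's real registries.
REGISTRY = {}

def register(cls):
    # Map the class name to the class so get() can look it up later.
    REGISTRY[cls.__name__] = cls
    return cls

def get(params):
    # params mirrors variant['something params']: a class name plus its kwargs.
    cls = REGISTRY[params['class_name']]
    return cls(**params['config'])

@register
class SimpleReplayPool:
    def __init__(self, max_size=1000, environment=None):
        self.max_size = max_size
        self.environment = environment

pool = get({'class_name': 'SimpleReplayPool', 'config': {'max_size': 500}})
print(type(pool).__name__, pool.max_size)  # SimpleReplayPool 500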
Example No. 21
from softlearning.environments.utils import get_environment_from_params
import gym

params = {
    'universe': 'gym',
    'domain': 'Point2D',
    'task': 'Fixed-v0',
    'kwargs': {
        'normalize': False,
        'init_pos_range': ((0, 0), (0, 0)),
        'target_pos_range': ((-2, -2), (2, 2)),
        'observation_keys': ('state_observation', 'state_desired_goal'),
    }
}

env = get_environment_from_params(params)

# for _ in range(100):
env.reset()
for _ in range(10):
    env.step(env.action_space.sample())
    env.render()
Example No. 22
def main(variant_in):
    variant = copy.deepcopy(variant_in)

    environment_params = variant['environment_params']
    training_environment = get_environment_from_params(environment_params['training'])
    evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params else training_environment
    )

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = replay_pools.get(variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler
    })
    algorithm = algorithms.get(variant['algorithm_params'])
    print("Initialization finished")

    train_generator = None
    # The loop below runs for 'n_epochs' epochs. During each epoch the sampler
    # adds 'epoch_length' samples to the pool (resets are not counted), and a
    # training step is taken at every sample once the replay pool holds more
    # than 'min_pool_size' samples. A toy sketch of the generator protocol
    # consumed here follows this example.
    for i in count():
        if train_generator is None:
            train_generator = algorithm.train()
        diagnostics = next(train_generator)

        # Check for 'done' before printing, to avoid printing the final epoch twice
        try:
            if diagnostics['done']:
                break
        except KeyError:
            pass

        evalu_reward = diagnostics["evaluation"]["episode-reward-mean"]
        print(f"Evaluation: reward mean is {evalu_reward}")
        # train_reward = diagnostics["training"]["episode-reward-mean"]
        # print(f"Training: reward mean is {train_reward}")

    print("Finish")
    return policy
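The comments inside the training loop above treat algorithm.train() as a generator that yields one diagnostics dict per epoch until it reports 'done'. The following toy sketch shows that protocol in isolation; the trivial train() generator here is an assumption for illustration only, not the library's algorithm:

# Toy stand-in for algorithm.train(): yields one diagnostics dict per epoch,
# then a final dict marking completion, mirroring the loop in main() above.
def train(n_epochs=3):
    for epoch in range(n_epochs):
        yield {'evaluation': {'episode-reward-mean': float(epoch)}}
    yield {'done': True}

train_generator = train()
while True:
    diagnostics = next(train_generator)
    if diagnostics.get('done'):
        break
    print(f"Evaluation: reward mean is {diagnostics['evaluation']['episode-reward-mean']}")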
Example No. 23
def get_goal_example_from_variant(variant):
    train_env_params = variant['environment_params']['training']

    env = get_environment_from_params(train_env_params)
    total_goal_examples = (
        variant['data_params']['n_goal_examples'] +
        variant['data_params']['n_goal_examples_validation_max'])

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    if task in DOOR_TASKS:
        goal_examples = generate_door_goal_examples(total_goal_examples, env)
    elif task in PUSH_TASKS:
        goal_examples = generate_push_goal_examples(total_goal_examples, env)
    elif task in PICK_TASKS:
        goal_examples = generate_pick_goal_examples(total_goal_examples, env,
                                                    variant['task'])
    elif SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK.get(universe,
                                                 {}).get(domain, None):
        domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][
            domain]
        gen_func = domain_generators.get(task,
                                         domain_generators[DEFAULT_TASK_KEY])
        include_transitions = (
            variant['algorithm_params']['type'] == 'VICEDynamicsAware')
        goal_examples = gen_func(env,
                                 include_transitions=include_transitions,
                                 num_total_examples=total_goal_examples)
    else:
        try:
            env_path = os.path.join(
                goal_directory,
                GOAL_PATH_PER_UNIVERSE_DOMAIN_TASK[universe][domain][task])
            pkl_path = os.path.join(env_path, 'positives.pkl')
            with open(pkl_path, 'rb') as f:
                goal_examples = pickle.load(f)
        except KeyError:
            raise NotImplementedError

    n_goal_examples = variant['data_params']['n_goal_examples']
    # total_samples = len(goal_examples[next(iter(goal_examples))])

    # Shuffle the goal examples before assigning training/validation indices
    # (a small illustration of this split follows the example).
    shuffle = np.random.permutation(total_goal_examples)
    train_indices = shuffle[:n_goal_examples]
    valid_indices = shuffle[n_goal_examples:]

    goal_examples_train = dict([
        (key, {obs_key: value[obs_key][train_indices]
               for obs_key in value}) if isinstance(value, dict) else
        (key, value[train_indices]) for key, value in goal_examples.items()
    ])
    goal_examples_validation = dict([
        (key, {obs_key: value[obs_key][valid_indices]
               for obs_key in value}) if isinstance(value, dict) else
        (key, value[valid_indices]) for key, value in goal_examples.items()
    ])

    return goal_examples_train, goal_examples_validation
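The nested comprehensions at the end of the example above index every array in goal_examples (including arrays nested one level deep) with the training or validation indices. A small self-contained illustration of the same split on made-up data (the field names are assumptions):

# Illustration of the train/validation split used above, on toy data.
import numpy as np

goal_examples = {
    'observations': {'pixels': np.arange(10).reshape(5, 2)},  # nested dict of arrays
    'rewards': np.ones(5),
}
n_goal_examples = 3
shuffle = np.random.permutation(5)
train_indices, valid_indices = shuffle[:n_goal_examples], shuffle[n_goal_examples:]

def take(value, indices):
    # Index every leaf array, descending one level into nested dicts.
    if isinstance(value, dict):
        return {key: sub[indices] for key, sub in value.items()}
    return value[indices]

goal_examples_train = {k: take(v, train_indices) for k, v in goal_examples.items()}
goal_examples_validation = {k: take(v, valid_indices) for k, v in goal_examples.items()}
print(goal_examples_train['observations']['pixels'].shape)  # (3, 2)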
Example No. 24
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    import ipdb
    ipdb.set_trace()
    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    if args.use_state_estimator:
        environment_params['kwargs'].update({
            'pixel_wrapper_kwargs': {
                'pixels_only': False,
                'normalize': False,
                'render_kwargs': {
                    'width': 32,
                    'height': 32,
                    'camera_id': -1,
                }
            },
            'camera_settings': {
                'azimuth': 180,
                'distance': 0.35,
                'elevation': -55,
                'lookat': (0, 0, 0.03),
            },
        })
        # obs_keys = environment_params['kwargs'].pop('observation_keys')
        # non_object_obs_keys = [obs_key for obs_key in obs_keys if 'object' not in obs_key]
        # non_object_obs_keys.append('pixels')
        # environment_params['kwargs']['observation_keys'] = tuple(non_object_obs_keys)

    # if args.render_mode == 'human':
    #     if 'has_renderer' in environment_params['kwargs'].keys():
    #         environment_params['kwargs']['has_renderer'] = True

    # variant['environment_params']['evaluation']['task'] = 'TurnFreeValve3ResetFree-v0'
    # variant['environment_params']['evaluation']['kwargs']['reset_from_corners'] = True
    #     'reward_keys': (
    #         'object_to_target_position_distance_cost',
    #         'object_to_target_orientation_distance_cost',
    #     ),
    #     'swap_goal_upon_completion': False,
    # }
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (get_policy_from_variant(variant, evaluation_environment))
    policy.set_weights(picklable['policy_weights'])
    dump_path = os.path.join(checkpoint_path, 'policy_params.pkl')
    with open(dump_path, 'wb') as f:
        pickle.dump(picklable['policy_weights'], f)

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    from softlearning.preprocessors.utils import get_state_estimator_preprocessor
    state_estimator = get_state_estimator_preprocessor(
        state_estimator_path=
        '/home/justinvyu/dev/softlearning-vice/softlearning/models/state_estimators/state_estimator_fixed_antialias.h5',
        num_hidden_units=256,
        num_hidden_layers=2)
    sampler_kwargs = {
        'state_estimator': state_estimator,
        'replace_state': True,
    }

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs,
                         sampler_kwargs=sampler_kwargs)

    if args.render_kwargs.get('mode') == 'rgb_array':
        fps = 2 // getattr(evaluation_environment, 'dt', 1 / 30)
        for i, path in enumerate(paths):
            video_save_dir = args.checkpoint_path
            # video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.mp4')
            save_video(path['images'], video_save_path, fps=fps)

    return paths
Example No. 25
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    checkpoint_paths = [
        checkpoint_dir for checkpoint_dir in sorted(
            glob.iglob(os.path.join(experiment_path, 'checkpoint_*')),
            key=lambda d: float(d.split("checkpoint_")[1]))
    ]

    dump_dir = os.path.join(experiment_path, 'evaluations/')
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    all_paths = []
    for checkpoint_dir in checkpoint_paths[::2]:

        with session.as_default():
            pickle_path = os.path.join(checkpoint_dir, 'checkpoint.pkl')
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)

        environment_params = (variant['environment_params']['evaluation']
                              if 'evaluation' in variant['environment_params']
                              else variant['environment_params']['training'])

        environment_params['kwargs']['device_path'] = '/dev/ttyUSB0'
        environment_params['kwargs']['camera_config'] = {
            'topic': '/kinect2_001144463747/qhd/image_color',
            'image_shape': (256, 256, 3)
        }
        environment_params['kwargs']['init_pos_range'] = list((np.array([
            0, -np.pi / 4, -np.pi / 2, -3 * np.pi / 4, -np.pi, np.pi /
            4, np.pi / 2, np.pi * 3 / 4
        ]) + (-75 * np.pi / 180)) % (2 * np.pi) - np.pi)
        environment_params['kwargs']['target_pos_range'] = [-75 * np.pi / 180]
        environment_params['kwargs']['cycle_inits'] = True

        evaluation_environment = get_environment_from_params(
            environment_params)

        policy = (get_policy_from_variant(variant, evaluation_environment))

        policy_weights = picklable['policy_weights']
        if variant['algorithm_params']['type'] in ['MultiSAC', 'MultiVICEGAN']:
            policy_weights = policy_weights[0]
        policy.set_weights(policy_weights)
        # dump_path = os.path.join(checkpoint_path, 'policy_params.pkl')
        # with open(dump_path, 'wb') as f:
        #     pickle.dump(picklable['policy_weights'], f)

        render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

        with policy.set_deterministic(args.deterministic):
            paths = rollouts(args.num_rollouts,
                             evaluation_environment,
                             policy,
                             path_length=args.max_path_length,
                             render_kwargs=render_kwargs)

        if render_kwargs.get('mode') == 'rgb_array':
            fps = 2 // getattr(evaluation_environment, 'dt', 1 / 30)
            for i, path in enumerate(paths):
                # video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
                video_save_path = os.path.join(checkpoint_dir,
                                               f'episode_{i}.mp4')

                save_video(path['images'], video_save_path, fps=fps)
        all_paths.append(paths)

    with open(os.path.join(dump_dir, 'evaluation_paths.pkl'), 'wb') as f:
        pickle.dump(all_paths, f)
    return paths
Example No. 26
    def _build(self):
        variant = copy.deepcopy(self._variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params
            else training_environment)

        variant['Q_params']['config'].update({
            'input_shapes': (
                training_environment.observation_shape,
                training_environment.action_shape),
        })
        Qs = self.Qs = value_functions.get(variant['Q_params'])

        variant['policy_params']['config'].update({
            'action_range': (training_environment.action_space.low,
                             training_environment.action_space.high),
            'input_shapes': training_environment.observation_shape,
            'output_shape': training_environment.action_shape,
        })
        policy = self.policy = policies.get(variant['policy_params'])

        variant['replay_pool_params']['config'].update({
            'environment': training_environment,
        })
        replay_pool = self.replay_pool = replay_pools.get(
            variant['replay_pool_params'])

        variant['sampler_params']['config'].update({
            'environment': training_environment,
            'policy': policy,
            'pool': replay_pool,
        })
        sampler = self.sampler = samplers.get(variant['sampler_params'])

        set_random_seed(variant['run_params']['seed'])
        save_path = os.path.join(os.path.dirname(__file__),"..","..", "results", f"logs",f"sac", f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
        print("this is the save path: " + save_path)
        os.makedirs(save_path, exist_ok=True)

        # create wrapped environment
        eval_env_wrapped = TimeLimit(evaluation_environment, 1000)

        eval_callback = EvalCallback(
            eval_env_wrapped,
            callback_on_new_best=None,
            best_model_save_path=None,
            n_eval_episodes=10,
            log_path=save_path,
            eval_freq=10000,  # TODO change hardcoded value
            deterministic=True,
            verbose=1,
        )
        eval_callback.init_callback(policy)
        sampler.set_callback(eval_callback)
        variant['algorithm_params']['config'].update({
            'training_environment': training_environment,
            'evaluation_environment': evaluation_environment,
            'policy': policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler
        })
        self.algorithm = algorithms.get(variant['algorithm_params'])

        self._built = True