Code Example #1
File: base_wm.py Project: dhruvramani/rl-car-safety
    def train(self, world_model_path):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            losses = []
            all_rewards = []
            save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='env_model')
            saver = tf.train.Saver(var_list=save_vars)

            train_writer = tf.summary.FileWriter('./env_logs/train/',
                                                 graph=sess.graph)
            summary_op = tf.summary.merge_all()

            if (self.n_envs == 1):
                envs = make_env()()
            else:
                envs = [make_env() for i in range(self.n_envs)]
                envs = SubprocVecEnv(envs)

            for idx, states, actions, rewards, next_states, dones in tqdm(
                    self.generate_data(envs), total=self.max_ep_len):
                actions = np.array(actions)
                actions = np.reshape(actions, (-1, 1))

                if (self.has_rewards):
                    target_reward = reward_to_target(rewards)
                    loss, reward_loss, state_loss, summary, _ = sess.run(
                        [
                            self.loss, self.reward_loss, self.state_loss,
                            summary_op, self.opt
                        ],
                        feed_dict={
                            self.states_ph: states,
                            self.actions_ph: actions,
                            self.target_states: next_states,
                            self.target_rewards: target_reward
                        })
                else:
                    loss, summary, _ = sess.run(
                        [self.loss, summary_op, self.opt],
                        feed_dict={
                            self.states_ph: states,
                            self.actions_ph: actions,
                            self.target_states: next_states,
                        })

                if idx % self.log_interval == 0:
                    if (self.has_rewards):
                        print(
                            '%i => Loss : %.4f, Reward Loss : %.4f, Image Loss : %.4f'
                            % (idx, loss, reward_loss, state_loss))
                    else:
                        print('%i => Loss : %.4f' % (idx, loss))
                    saver.save(sess,
                               '{}/env_model.ckpt'.format(world_model_path))
                    print('Environment model saved')

                train_writer.add_summary(summary, idx)
            envs.close()
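
A usage note: the checkpoint written above can later be restored by rebuilding the same env_model graph and filtering on its variable scope. A minimal restore sketch under that assumption (TF1 API, mirroring the snippet; the graph-construction code is omitted):

# Assumes the env_model graph has already been rebuilt in this process
with tf.Session() as sess:
    restore_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='env_model')
    saver = tf.train.Saver(var_list=restore_vars)
    saver.restore(sess, '{}/env_model.ckpt'.format(world_model_path))
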
Code Example #2
File: train.py Project: anthliu/rl-baselines-zoo
    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            if n_envs == 1:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             wrapper_class=env_wrapper,
                             log_dir=log_dir)
                ])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # On most env, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=log_dir,
                             wrapper_class=env_wrapper) for i in range(n_envs)
                ])
            if normalize:
                if args.verbose > 0:
                    if len(normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **normalize_kwargs)
        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
            del hyperparams['frame_stack']
        return env
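
For reference, DummyVecEnv and SubprocVecEnv expect a list of zero-argument callables, so the make_env helper used throughout these zoo scripts is a factory that returns an environment constructor. Below is a minimal sketch of such a factory; it is an assumption for illustration, while the zoo's actual utils.make_env also handles Monitor file naming and extra options:

import os
import gym
from stable_baselines.bench import Monitor


def make_env(env_id, rank=0, seed=0, log_dir=None, wrapper_class=None):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)              # give each worker its own seed
        if wrapper_class is not None:
            env = wrapper_class(env)       # optional gym.Wrapper
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)))
        return env
    return _init
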
Code Example #3
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:  # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    envs = VecNormalize(
        envs)  # normalize the envs during training and evaluation

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy,
                         envs,
                         nminibatches=1,
                         verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
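
Note that make_env here takes only the environment id, a simpler factory than the zoo variant above. A minimal sketch of what these helpers assume (NUM_CPU is the module-level worker count referenced in the snippet; the value is a placeholder):

import gym

NUM_CPU = 4  # assumed number of parallel workers


def make_env(env_id):
    def _init():
        return gym.make(env_id)
    return _init
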
Code Example #4
def create_env(n_envs, eval_env=False, no_log=False):

    global hyperparams, env_kwargs
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = DummyVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])
    if normalize:
        local_normalize_kwargs = {'norm_reward': False}
        env = VecNormalize(env, **local_normalize_kwargs)

    return env
Code Example #5
File: train.py Project: bit-bots/rl-baselines3-zoo
    def create_env(n_envs, eval_env=False, no_log=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :param no_log: (bool) Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else save_path

        if n_envs == 1:
            env = SubprocVecEnv(
                [make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, log_dir=log_dir, env_kwargs=env_kwargs)]
            )
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = SubprocVecEnv(
                [
                    make_env(env_id, i, args.seed, log_dir=log_dir, env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                    for i in range(n_envs)
                ]
            )
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs["norm_reward"] = False
                else:
                    local_normalize_kwargs = {"norm_reward": False}

            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print(f"Normalization activated: {local_normalize_kwargs}")
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

        # Optional Frame-stacking
        if hyperparams.get("frame_stack", False):
            n_stack = hyperparams["frame_stack"]
            env = VecFrameStack(env, n_stack)
            print(f"Stacking {n_stack} frames")

        if is_image_space(env.observation_space):
            if args.verbose > 0:
                print("Wrapping into a VecTransposeImage")
            env = VecTransposeImage(env)
        return env
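
A hedged sketch of how such a create_env helper is typically driven from the training script. The PPO and EvalCallback usage is standard Stable-Baselines3; n_envs and the step counts are placeholders:

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback

env = create_env(n_envs)                                     # training envs (logged)
eval_callback = EvalCallback(create_env(1, eval_env=True),   # separate, unlogged eval env
                             eval_freq=10_000)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=1_000_000, callback=eval_callback)
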
Code Example #6
    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path

        if n_envs == 1:
            env = DummyVecEnv([
                make_env(env_id,
                         0,
                         args.seed,
                         wrapper_class=env_wrapper,
                         log_dir=log_dir,
                         env_kwargs=env_kwargs)
            ])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([
                make_env(env_id,
                         i,
                         args.seed,
                         log_dir=log_dir,
                         env_kwargs=env_kwargs,
                         wrapper_class=env_wrapper) for i in range(n_envs)
            ])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print(f"Normalization activated: {normalize_kwargs}")
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)

        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print(f"Stacking {n_stack} frames")

        if is_image_space(env.observation_space):
            if args.verbose > 0:
                print("Wrapping into a VecTransposeImage")
            env = VecTransposeImage(env)
        return env
Code Example #7
        def create_env(n_envs):
            """
            Create the environment and wrap it if necessary
            :param n_envs: (int)
            :return: (gym.Env)
            """
            global hyperparams

            if is_atari:
                if args.verbose > 0:
                    print("Using Atari wrapper")
                env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
                # Frame-stacking with 4 frames
                env = VecFrameStack(env, n_stack=4)
            elif args.algo in ['dqn', 'ddpg']:
                if hyperparams.get('normalize', False):
                    print(
                        "WARNING: normalization not supported yet for DDPG/DQN"
                    )
                # No env_wrapper applied for now as not using make_env()
                env = gym.make(env_id)
                env.seed(args.seed)
            else:
                if n_envs == 1:
                    env = DummyVecEnv([
                        make_env(env_id,
                                 0,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                    ])
                else:
                    # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                    # On most env, SubprocVecEnv does not help and is quite memory hungry
                    env = DummyVecEnv([
                        make_env(env_id,
                                 i,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                        for i in range(n_envs)
                    ])
                if normalize:
                    if args.verbose > 0:
                        print("Normalizing input and return")
                    env = VecNormalize(env, **normalize_kwargs)
            # Optional Frame-stacking
            if hyperparams.get('frame_stack', False):
                n_stack = hyperparams['frame_stack']
                env = VecFrameStack(env, n_stack)
                print("Stacking {} frames".format(n_stack))
                del hyperparams['frame_stack']
            return env
Code Example #8
File: learn.py Project: SvenGronauer/tensorbox
def play(args, **kwargs):
    """ play mode """

    print(args.dir)
    assert args.dir, 'Please provide directory where checkpoint file is located'

    kwargs['normalize'] = True
    normed_env = U.make_env(**kwargs)  # use env.setup() after session creation to apply mean/std to obs and rewards

    model = ImpalaModel(observation_shape=normed_env.observation_space.shape,
                        n_actions=normed_env.action_space.n, learning_rate=0.01, entropy_scale=0.0)

    # max_steps = 10000
    # hooks = [tf.train.StopAtStepHook(last_step=max_steps)]  # , PyProcessHook()]

    print('Restore from:', args.dir)
    with tf.train.SingularMonitoredSession(checkpoint_dir=args.dir) as sess:

        normed_env.setup(session=sess)  # restore values for running mean/std
        print('Restored from global step:', sess.run(model.global_step))

        try:
            done = False
            obs = normed_env.reset()
            print(obs)

            while not done:
                normed_env.render()
                action, _ = model.get_action_and_prob(session=sess, observation=obs)
                obs, reward, done, info = normed_env.step(action)

        except KeyboardInterrupt:
            print('got KeyboardInterrupt')
        finally:
            pass
Code Example #9
File: main.py Project: dkkim93/gym-wolfpack
def main(args):
    # Create directories
    if not os.path.exists("./logs"):
        os.makedirs("./logs")

    # Set logs
    log = set_log(args)

    # Create env
    env = make_env(log, args)

    # Set seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Visualize environment
    observations = env.reset()

    for _ in range(args.ep_max_timesteps):
        env.render()

        prey_action = env.action_space.sample()
        predator1_action = env.action_space.sample()
        predator2_action = env.action_space.sample()
        actions = [prey_action, predator1_action, predator2_action]

        observations, reward, done, _ = env.step(actions)

        if done:
            break
Code Example #10
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4)
        optim, optimize = dqn.optimize(learning_rate=0.0001)
        sess.run(tf.global_variables_initializer())
        dqn.train(
            num_steps=3000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=1024,
            batch_size=16,
            min_buffer_size=20000)
Code Example #11
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.train_envs, self.test_envs = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.train_envs[0].observation_space.shape[0] + cfg.noise_dims
        cfg.agent.params.action_dim = self.train_envs[0].action_space.shape[0]
        if cfg.agent.name != 'sac':
            cfg.agent.params.num_envs = cfg.num_train_envs
        cfg.agent.params.action_range = [
            float(self.train_envs[0].action_space.low.min()),
            float(self.train_envs[0].action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)
        self.agent.seq_len = cfg.seq_len

        self.replay_buffer = MultiEnvReplayBuffer(
            (cfg.agent.params.obs_dim,),  # hard coded
            self.train_envs[0].action_space.shape,
            int(cfg.replay_buffer_capacity),
            self.device,
            num_envs=cfg.num_train_envs,
            seq_len=cfg.seq_len)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = [0] * cfg.num_train_envs
Code Example #12
def main(_):
    env_name = XMAGICAL_EMBODIMENT_TO_ENV_NAME[FLAGS.embodiment]
    env = utils.make_env(env_name, seed=0)

    # Reward learning wrapper.
    if FLAGS.config.reward_wrapper.pretrained_path is not None:
        env = utils.wrap_learned_reward(env, FLAGS.config)

    viewer = KeyboardEnvInteractor(action_dim=env.action_space.shape[0])

    env.reset()
    obs = env.render("rgb_array")
    viewer.imshow(obs)

    i = [0]
    rews = []

    def step(action):
        obs, rew, done, info = env.step(action)
        rews.append(rew)
        if obs.ndim != 3:
            obs = env.render("rgb_array")
        if done:
            print(f"Done, score {info['eval_score']:.2f}/1.00")
            print("Episode metrics: ")
            for k, v in info["episode"].items():
                print(f"\t{k}: {v}")
            if FLAGS.exit_on_done:
                return
        i[0] += 1
        return obs

    viewer.run_loop(step)

    utils.plot_reward(rews)
Code Example #13
File: worker.py Project: MihaiAnca13/HER-lightning
def process_func(proc_idx, params, replay_buffer, model, state_normalizer,
                 goal_normalizer, log_func):
    env = make_env(params, proc_idx)
    w = Worker(proc_idx, params, env, replay_buffer, model, state_normalizer,
               goal_normalizer, log_func)
    print(f"Spawning worker with id: {proc_idx}")
    w.loop()
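
A hedged sketch of how process_func might be spawned across workers; torch.multiprocessing and the params.num_workers field are assumptions suggested by the shared model, buffer and normalizer arguments:

import torch.multiprocessing as mp

processes = []
for proc_idx in range(params.num_workers):    # assumed worker count
    p = mp.Process(target=process_func,
                   args=(proc_idx, params, replay_buffer, model,
                         state_normalizer, goal_normalizer, log_func))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
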
Code Example #14
def fit():
    env, args = make_env()
    env.render()

    # Assumes the agent shares the same model
    policy = MARL_MBPO(args)
    agents = [policy for i in range(env.n)]

    rewards = []

    for time_step in tqdm(range(args.time_steps)):

        if time_step % args.maximum_episode_length == 0:
            observations = env.reset()

        # Make a copy; changed in environment transition
        initial_obs = copy.deepcopy(observations)

        actions = []
        for i, observation in enumerate(observations):
            actions.append(agents[i].action(observation))

        observations, rewards, done, _ = env.step(actions)

        # Make a copy; changed in environment transition
        next_obs = copy.deepcopy(observations)

        # Store into the buffer
        policy.model_buffer.store(initial_obs, actions, next_obs, rewards)

        env.render()

        if time_step > args.batch_size:
            policy.train()
Code Example #15
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code Example #16
File: test_simple.py Project: liusida/thesis-bodies
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
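
For context, a hedged sketch (Stable-Baselines3 API) of the training-side counterpart that would produce the model_filename and vec_filename consumed above; utils.make_env is assumed to behave as in this project, and train_body is a placeholder:

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

venv = DummyVecEnv([utils.make_env(render=False, robot_body=train_body, body_info=0)])
venv = VecNormalize(venv)                  # track running obs/reward statistics
model = PPO("MlpPolicy", venv, verbose=1)
model.learn(total_timesteps=1_000_000)
model.save(model_filename)                 # later read by PPO.load(...)
venv.save(vec_filename)                    # later read by VecNormalize.load(...)
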
Code Example #17
File: rc_rl.py Project: Kostis-S-Z/exploring_meta
def measure_change_through_time(path, env_name, policy, rep_params):
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])
    global metrics
    metrics = ['CCA']

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        sanity_ep = env_task.run(policy, episodes=1)

    init_change_m = defaultdict(list)
    init_change_v = defaultdict(list)
    adapt_change_m = defaultdict(list)
    adapt_change_v = defaultdict(list)
    checkpoints = path + f'/model_checkpoints/'
    i = 0

    file_list = os.listdir(checkpoints)
    file_list = [file for file in file_list if 'baseline' not in file]
    models_list = {}
    for file in file_list:
        n_file = file.split('_')[-1]
        n_file = n_file.split('.')[0]
        n_file = int(n_file)
        models_list[n_file] = f'model_{n_file}.pt'

    prev_policy = policy
    for key in sorted(models_list.keys()):
        model_chckpnt = models_list[key]
        if i > 40:
            break
        i += 1

        print(f'Loading {model_chckpnt} ...')
        chckpnt_policy = DiagNormalPolicy(9, 4)
        chckpnt_policy.load_state_dict(torch.load(os.path.join(checkpoints, model_chckpnt)))
        chckpnt_policy = MAML(chckpnt_policy, lr=rep_params['inner_lr'])

        mean, variance = episode_mean_var(sanity_ep, policy, chckpnt_policy, layer=6)
        a_mean, a_variance = episode_mean_var(sanity_ep, prev_policy, chckpnt_policy, layer=6)
        init_change_m['CCA'] += [mean['CCA']]
        init_change_v['CCA'] += [variance['CCA']]
        adapt_change_m['CCA'] += [a_mean['CCA']]
        adapt_change_v['CCA'] += [a_variance['CCA']]

        prev_policy = chckpnt_policy

    for metric in metrics:
        plot_sim_across_steps(init_change_m[metric], init_change_v[metric], metric=metric,
                              title='Similarity between init and adapted (in %)')

    for metric in metrics:
        difference = [1 - x for x in adapt_change_m[metric]]
        plot_sim_across_steps(difference, adapt_change_v[metric], metric=metric,
                              title='Representation difference after each step (in %)')
Code Example #18
File: rl_train.py Project: ruthschoebel/sdc-gym
def main():
    script_start = str(datetime.datetime.now()).replace(':',
                                                        '-').replace(' ', 'T')

    args = utils.parse_args()

    args.script_start = script_start
    args_path = Path(f'args_{script_start}.json')
    with open(args_path, 'w') as f:
        json.dump(vars(args), f, indent=4)

    utils.setup(args.use_sb3, args.debug_nans)

    eval_seed = args.seed
    if eval_seed is not None:
        eval_seed += args.num_envs

    # ---------------- TRAINING STARTS HERE ----------------

    # Set up gym environment
    env = utils.make_env(args, include_norm=True)
    # Set up model
    model = setup_model(args, env)

    callbacks = []
    utils.append_callback(callbacks, utils.create_save_callback(args))
    utils.append_callback(callbacks, utils.create_eval_callback(args))

    dry_run(model, env, int(args.warmup_steps))
    env.seed(args.seed)

    start_time = time.perf_counter()
    # Train the model (need to put at least 100k steps to
    # see something)
    model.learn(total_timesteps=int(args.steps), callback=callbacks)
    duration = time.perf_counter() - start_time
    print(f'Training took {duration} seconds.')
    # env.envs[0].plot_rewards()
    print('Number of episodes in each environment:',
          [env_.num_episodes for env_ in env.envs])

    model_fname = Path(f'sdc_model_{args.model_class.lower()}_'
                       f'{args.policy_class.lower()}_{script_start}.zip')
    model.save(str(model_fname))

    env_fname = Path(f'sdc_env_{script_start}.pkl')
    utils.save_env(env_fname, env)

    # delete trained model to demonstrate loading, not really necessary
    # del model

    # ---------------- TESTING STARTS HERE ----------------

    fig_path = Path(f'results_{script_start}.pdf')

    run_tests(model, args, seed=eval_seed, fig_path=fig_path)
Code Example #19
def make_envs(procs, env_name, seed, extrap_min, extrap_max):
    envs = []
    for i in range(procs):
        env = utils.make_env(env_name, seed + 100000 * i, {"extrapolate_min": extrap_min, "extrapolate_max": extrap_max})
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded\n")
    return env
Code Example #20
def main(num_games=10, load_checkpoint=False, env_name='PongNoFrameskip-v4'):
    env = make_env(env_name)
    best_score = -np.inf

    agent = DQNAgent(gamma=0.99,
                     epsilon=1.0,
                     lr=0.0001,
                     input_dims=(env.observation_space.shape),
                     n_actions=env.action_space.n,
                     mem_size=20000,
                     eps_min=0.1,
                     batch_size=32,
                     replace=1000,
                     eps_dec=1e-5,
                     chkpt_dir='models/',
                     algo='DQNAgent',
                     env_name=env_name)
    if load_checkpoint:
        agent.load_models()
    fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \
            + str(num_games) + 'games'
    figure_file = 'plots/' + fname + '.png'
    n_steps = 0
    scores, eps_history, steps_array = [], [], []

    for i in range(num_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            if not load_checkpoint:
                agent.store_transition(observation, action, reward,
                                       observation_, int(done))
                agent.learn()

            observation = observation_
        scores.append(score)
        steps_array.append(n_steps)
        avg_score = np.mean(scores[-100:])
        print('episode: ', i, 'score: ', score,
              ' average score %.1f' % avg_score,
              'best score %.2f' % best_score, 'epsilon %.2f' % agent.epsilon,
              'steps', n_steps)

        if avg_score > best_score:
            #if not load_checkpoint:
            #    agent.save_models()
            best_score = avg_score

        eps_history.append(agent.epsilon)
        if load_checkpoint and n_steps >= 18000:
            break

    x = [i + 1 for i in range(len(scores))]
    plot_learning_curve(steps_array, scores, eps_history, figure_file)
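
A usage note (hedged sketch): the function above is self-contained, so a typical entry point simply calls it; to replay a saved agent, pass load_checkpoint=True with a small num_games:

if __name__ == '__main__':
    main(num_games=500)                          # train from scratch
    # main(num_games=1, load_checkpoint=True)    # or watch a saved checkpoint
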
Code Example #21
    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:  # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env
Code Example #22
def main():
    args = parse_args()

    env = make_env(args.env)
    model = get_model(args.policy_ckpt_dir)
    if args.reward_predictor_ckpt_dir:
        reward_predictor = get_reward_predictor(args.reward_predictor_ckpt_dir)
    else:
        reward_predictor = None

    run_agent(env, model, reward_predictor, args.frame_interval_ms)
Code Example #23
File: algo.py Project: cclauss/lagom
    def __call__(self, config):
        # Set random seeds: PyTorch, numpy.random, random
        set_global_seeds(seed=config['seed'])
        
        # Create environment and seed it
        env = make_env(seed=config['seed'], 
                       monitor=False, 
                       monitor_dir=None)
        # Create environment specification
        env_spec = EnvSpec(env)  # TODO: integrate within make_env globally
        
        # Create device
        device = torch.device('cuda' if config['cuda'] else 'cpu')
        
        # Create logger
        logger = Logger(name='logger')
        
        # Create policy
        network = MLP(config=config)
        policy = CategoricalPolicy(network=network, env_spec=env_spec)
        policy.network = policy.network.to(device)

        # Create optimizer
        optimizer = optim.Adam(policy.network.parameters(), lr=config['lr'])
        # Learning rate scheduler
        max_epoch = config['train_iter']  # Max number of lr decay, Note where lr_scheduler put
        lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
        
        # Create agent
        agent_class = ActorCriticAgent  # REINFORCEAgent
        agent = agent_class(policy=policy, 
                            optimizer=optimizer, 
                            config=config, 
                            lr_scheduler=lr_scheduler, 
                            device=device)
        
        # Create runner
        runner = Runner(agent=agent, 
                        env=env, 
                        gamma=config['gamma'])
        
        # Create engine
        engine = Engine(agent=agent, 
                        runner=runner, 
                        config=config, 
                        logger=logger)
        
        # Training
        train_output = engine.train()
        np.save('logs/returns_ActorCritic', train_output)
        
        return None
Code Example #24
def test(test_n, seed, model_filename, vec_filename, train, test, test_as_class=0, render=False, save_file="default.yml"):

    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f" Seed {seed+i}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test, body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed+i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:  # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, total_reward {total_reward}, distance_x {distance_x}")

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # avoid yaml turn float64 to numpy array
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
Code Example #25
def _train(env_id, model_params, total_epochs, use_sigmoid_layer=False, is_evaluation=False):
  if is_evaluation: # evaluate_policy() must only take one environment
    envs = SubprocVecEnv([make_env(env_id)])
  else:
    envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
  envs = VecNormalize(envs) # normalize the envs during training and evaluation

  # activation fn: use tanh for delta hedging and relu for mean reversion
  # learning rate: use 1e-7 for delta hedging and 1e-5 for mean reversion
  if use_sigmoid_layer:
    model = PPO2(SigmoidMlpPolicy, envs, n_steps=1, nminibatches=1,
                 learning_rate=lambda f: f * 1e-5, verbose=1,
                 policy_kwargs=dict(act_fun=tf.nn.relu),
                 **model_params)
  else:
    model = PPO2(MlpLstmPolicy, envs, n_steps=1, nminibatches=1,
                 learning_rate=lambda f: f * 1e-5, verbose=1,
                 policy_kwargs=dict(act_fun=tf.nn.relu),
                 **model_params)

  model.learn(total_timesteps=total_epochs * L)
  return envs, model
Code Example #26
File: test.py Project: Yoshi-0921/MADDPG-PL
    def __init__(self):
        self.env = make_env(scenario_name='scenarios/new_env')  # 'simple_spread'
        self.num_agents = self.env.n
        self.agents = [DDPGAgent(self.env,
                                 agent_id,
                                 actor_lr=0.0,
                                 critic_lr=0.0,
                                 gamma=1.0) for agent_id in range(self.num_agents)]
        for agent in self.agents:
            # agent.actor.load_state_dict(torch.load('./saved_weights/actor_3000.weights', map_location=torch.device('cpu')))
            # agent.critic.load_state_dict(torch.load('./saved_weights/critic_3000.weights', map_location=torch.device('cpu')))
            pass
        self.reset()
Code Example #27
File: rc_rl.py Project: Kostis-S-Z/exploring_meta
def sanity_check(env_name, model_1, model_2, rep_params):
    # Sample a sanity batch
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])

    env.active_env.random_init = False

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        init_sanity_ep = env_task.run(model_1, episodes=1)

        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        adapt_sanity_ep = env_task.run(model_2, episodes=1)
        env_task.reset()
        adapt_2_sanity_ep = env_task.run(model_2, episodes=1)

        init_san_rew = init_sanity_ep.reward().sum().item()
        adapt_san_rew = adapt_sanity_ep.reward().sum().item()
        adapt_2_san_rew = adapt_2_sanity_ep.reward().sum().item()

        # print(f'Why are these not equal? They should be equal: {init_san_rew}={adapt_san_rew}={adapt_2_san_rew}')
        # assert (init_san_rew == adapt_san_rew), "Environment initial states are random"
        init_sanity_state = init_sanity_ep[0].state

        init_rep_sanity = model_1.get_representation(init_sanity_state)
        init_rep_sanity_2 = model_1.get_representation(init_sanity_state, layer=3)

        adapt_rep_sanity = model_2.get_representation(init_sanity_state)
        adapt_rep_sanity_2 = model_2.get_representation(init_sanity_state, layer=3)

        init_rep_array = init_rep_sanity.detach().numpy()
        init_rep_2_array = init_rep_sanity_2.detach().numpy()
        adapt_rep_array = adapt_rep_sanity.detach().numpy()
        adapt_rep_2_array = adapt_rep_sanity_2.detach().numpy()

        print(f'Are the representations of the two models for the same state identical? '
              f'{np.array_equal(init_rep_array, adapt_rep_array)}')

        assert np.array_equal(init_rep_array, adapt_rep_array), "Representations not identical"
        assert np.array_equal(init_rep_2_array, adapt_rep_2_array), "Representations not identical"
Code Example #28
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
  test_env = SubprocVecEnv([make_env(env_id)])
  sharpe_ratios = []
  for episode in range(num_eps):
    # Padding zeros to the test env to match the shape of the training env.
    zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
    zero_completed_obs[0, :] = test_env.reset()
    state = None
    for _ in range(L):
      action, state = model.predict(zero_completed_obs, state=state, deterministic=True)
      zero_completed_obs[0, :], reward, done, _ = test_env.env_method('step', action[0], indices=0)[0]
    sharpe_ratios.append(test_env.env_method('get_sharpe_ratio', indices=0)[0])
    if plot: test_env.env_method('render', indices=0)
  test_env.close()
  
  # Return the average sharpe ratio
  return sum(sharpe_ratios) / len(sharpe_ratios)
Code Example #29
File: main.py Project: hhase/spinal-navigation-rl
        def create_env(env_params):
            global hyperparams

            if algo_ in ['dqn']:
                env = gym.make(env_id, env_params=env_params)
                env.seed(args.seed)
                if env_wrapper is not None:
                    env = env_wrapper(env)
            else:
                env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, env_params=env_params)])
                if normalize:
                    if args.verbose > 0:
                        if len(normalize_kwargs) > 0:
                            print("Normalization activated: {}".format(normalize_kwargs))
                        else:
                            print("Normalizing input and reward")
                    env = VecNormalize(env, **normalize_kwargs)
            return env
Code Example #30
File: myMain.py Project: MoCuishle28/offlineRL
def eval_policy(policy, env_name, seed, eval_episodes=10):
	eval_env, _, _, _ = utils.make_env(env_name, atari_preprocessing)
	eval_env.seed(seed + 100)

	avg_reward = 0.
	for _ in range(eval_episodes):
		state, done = eval_env.reset(), False
		while not done:
			action = policy.select_action(np.array(state), eval=True)
			state, reward, done, _ = eval_env.step(action)
			avg_reward += reward

	avg_reward /= eval_episodes

	print("---------------------------------------")
	print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
	print("---------------------------------------")
	return avg_reward