Code Example #1
def load_new_opp(self, idx, opp_fp, opp_elo):
    # Load an opponent checkpoint as (model, Elo, path): overwrite slot idx if it
    # already exists, otherwise append a new slot, then make it the current opponent.
    if idx < len(self.opponents):
        self.opponents[idx] = (PPO.load(opp_fp), opp_elo, opp_fp)
        self.curr_opp = idx
    else:
        self.opponents.append((PPO.load(opp_fp), opp_elo, opp_fp))
        self.curr_opp = len(self.opponents) - 1
Code Example #2
File: pinokio5.py Project: JEdward7777/Pinokio
def main():

    tensorboard_log = "./log"

    env = Pinokio5()
    # PPO requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log)

    try:
        while True:
            #model.learn(total_timesteps=10000)
            model.learn(total_timesteps=8000000, tb_log_name=tb_log_name)

            model.save(save_file)

            obs = env.reset()
            for i in range(100):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                env.render()
                if done:
                    print("resetting because " + str(done))
                    env.reset()
    except KeyboardInterrupt:
        print("Saving before exiting...")
        model.save(save_file)
        print("k bye")
Code Example #3
File: simulate.py Project: 3neutronstar/flow_RL
def run_model_stablebaseline3(flow_params,
                              num_cpus=1,
                              rollout_size=5,
                              num_steps=5):
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines3 import PPO
    from stable_baselines3.ppo import MlpPolicy
    import torch.nn as nn

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO(MlpPolicy,
                      env=env,
                      verbose=1,
                      n_epochs=rollout_size,
                      tensorboard_log="./PPO_tensorboard/",
                      device="cuda")  # cpu, gpu selection
    # automatically select gpu
    train_model.learn(total_timesteps=num_steps * rollout_size)  #
    return train_model
Code Example #4
def train(run_name: str, config: Dict[str, Any]):
    cfg_t = config['train']
    cfg_p = config['preprocess']
    run_dir = get_run_dir(run_name)
    os.makedirs(run_dir, exist_ok=False)  # create the run directory before writing the config into it
    dump_config(run_dir, config)

    def _make_env(n_envs: int, is_eval: bool):
        return make_env(seed=0,
                        n_envs=n_envs,
                        run_dir=run_dir,
                        frame_skip=cfg_p['frame_skip'],
                        frame_stack=cfg_p['frame_stack'],
                        is_eval=is_eval)

    train_env = _make_env(cfg_t['n_envs'], False)
    eval_env = _make_env(1, False)
    model = PPO('CnnPolicy',
                train_env,
                n_steps=cfg_t['n_steps'],
                n_epochs=cfg_t['n_epochs'],
                batch_size=cfg_t['batch_size'],
                learning_rate=cfg_t['lr'],
                tensorboard_log=os.path.join(run_dir, 'tb'))
    model.learn(cfg_t['total_steps'],
                eval_env=eval_env,
                eval_freq=cfg_t['eval_freq'] // cfg_t['n_envs'],
                n_eval_episodes=cfg_t['n_eval_eps'],
                eval_log_path=run_dir)
Code Example #5
File: train.py Project: gkswamy98/pillbox
def train_advil(env, n=0):
    venv = gym.make(env)
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=True)
                pi = advil_training(expert_data, venv, iters=0)
            else:
                expert_data = make_sa_dataloader(env,
                                                 max_trajs=num_trajs,
                                                 normalize=True,
                                                 batch_size=1024)
                pi = advil_training(expert_data, venv)

            def get_policy(*args, **kwargs):
                return pi

            model = PPO(get_policy, env, verbose=1)
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
            np.savez(os.path.join("learners", env,
                                  "advil_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
Code Example #6
def init_adv(adv_env_id, disable_adv=False, env_kwargs=None):
    bridge = Bridge()
    default_env_kwargs = {
        'renders' if 'CartPole' in adv_env_id else 'render': render
    }
    if env_kwargs is None:
        env_kwargs = {}
    env_kwargs.update(default_env_kwargs)
    env = make_vec_env(adv_env_id, env_kwargs=env_kwargs, seed=seed)
    env = VecNormalize(env)
    prot_agent = PPO('MlpPolicy',
                     env,
                     verbose=verbose,
                     seed=seed,
                     n_steps=ts,
                     bridge=bridge,
                     is_protagonist=True)
    if disable_adv:
        bridge.link_agents(prot_agent, None)
    else:
        adv_agent = PPO('MlpPolicy',
                        env,
                        verbose=verbose,
                        seed=seed,
                        n_steps=ts,
                        bridge=bridge,
                        is_protagonist=False)
        bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, env
Code Example #7
def save_new_model(name, env, num_envs, model_dir, batch_size=None, n_steps=None,
        n_epochs=None, clip_range=None, gamma=None, gae_lambda=None, vf_coef=None,
        ent_coef=None, learning_rate=None, image_based=False, image_pretrain=None,
        verbose=0, w=.1):
    if not batch_size:
        batch_size = choose_hyperp("batch_size", 10, w=w)
    if not n_steps:
        n_steps = max(batch_size, choose_hyperp("n_steps", 10, w=w))//num_envs
    if not n_epochs:
        n_epochs = choose_hyperp("n_epochs", 2, w=w)
    if not clip_range:
        clip_range = choose_hyperp("clip_range", 1, w=w)
    if not gamma:
        gamma = choose_hyperp("gamma", 2, w=w)
    if not gae_lambda:
        gae_lambda = choose_hyperp("gae_lambda", 1, w=w)
    if not vf_coef:
        vf_coef = choose_hyperp("vf_coef", 0, w=w)
    if not ent_coef:
        ent_coef = choose_hyperp("ent_coef", 0, w=w)
    if not learning_rate:
        learning_rate = choose_hyperp("learning_rate", 5, w=w)
        
    feature_extractor = "MlpPolicy"
    if image_based:
        feature_extractor = "CnnPolicy"
    
    model = PPO(feature_extractor, env, batch_size=batch_size, n_steps=n_steps, 
                n_epochs=n_epochs, clip_range=clip_range, gamma=gamma, gae_lambda=gae_lambda,
                vf_coef=vf_coef, ent_coef=ent_coef, learning_rate=learning_rate, verbose=verbose)
    if image_based and image_pretrain:
        model.policy.features_extractor.cnn.load_state_dict(T.load(image_pretrain+"_cnn.pth"))
        model.policy.features_extractor.linear.load_state_dict(T.load(image_pretrain+"_linear.pth"))
    model.save(model_dir + name + '/' + name + "_0")
    return model
Code Example #8
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Keep a separate, non-vectorized environment for rendering/evaluation
    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Code Example #9
def train():
    train_images, test_images = load_data("dataset")

    env = Monitor(
        PuzzleEnv(images=train_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  max_step_num=100,
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))

    policy_kwargs = dict(
        features_extractor_class=CustomCNN,
        features_extractor_kwargs=dict(features_dim=128),
    )

    model = PPO('CnnPolicy',
                env,
                policy_kwargs=policy_kwargs,
                verbose=1,
                learning_rate=0.0005,
                seed=42)
    model.learn(total_timesteps=1000000)

    test(model, test_images)
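
Examples #9 and #16 pass a CustomCNN features extractor via policy_kwargs without showing its definition. Below is a minimal sketch following the custom feature extractor pattern from the Stable-Baselines3 policy guide; the layer sizes are illustrative and the real extractors in those projects may differ.

import gym
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class CustomCNN(BaseFeaturesExtractor):
    """Small CNN features extractor (illustrative sketch, not the projects' actual code)."""

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 128):
        super().__init__(observation_space, features_dim)
        # SB3 passes images channel-first, so shape[0] is the channel count
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened size with one dummy forward pass
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()).shape[1]
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))
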
Code Example #10
File: rl_demo.py Project: yonkshi/bulb
def main(args):
    envs = make_vec_env(args.env_name,
                        n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.viz:
        nm_core, nm_vrsn, = args.env_name.split('-')
        nm_core += 'Viz' if args.viz else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy',
                     envs,
                     verbose=1,
                     seed=args.seed,
                     device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.viz:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
Code Example #11
def main():
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # controlling performance level of agent

    try:
        os.mkdir(save_dir)
    except OSError as error:
        pass

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    if os.path.exists(load_path):
        alg = PPO(train_policy, alg_env, verbose=0)
        alg.set_parameters(load_path, exact_match=True)
        # alg = PPO.load(load_path, env=alg_env)
        print('loaded alg checkpoint ' + load_path)
    else:
        alg = PPO(train_policy, alg_env, verbose=0)
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint ' + save_path)
Code Example #12
def main():
    base_args, base_parser = get_logger2_args()
    args = get_args(base_parser)
    args.device = init_gpus_and_randomness(args.seed, args.gpu)
    logger = Logger2('/tmp/tmp', use_tensorboardX=True)
    logger.log_tb_object(args, 'args')
    envs = make_vec_env(args.env_name,
                        n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.visualize:
        nm_core, nm_vrsn, = args.env_name.split('-')
        nm_core += 'Viz' if args.visualize else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy',
                     envs,
                     verbose=1,
                     seed=args.seed,
                     device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.visualize:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
Code Example #13
def __call__(self):
    policy_kwargs = dict(activation_fn=th.nn.ReLU)
    model = PPO('CnnPolicy',
                self.env,
                learning_rate=1e-3,
                policy_kwargs=policy_kwargs).learn(self.total_time_steps)
    model.save('PPO_' + self.game_name)
    del model  # the model has been trained and saved, so it's no longer needed
Code Example #14
def test_vec_with_ppo():
    """
    Test the `VecExtractDictObs` with PPO
    """
    env = DictObsVecEnv()
    env = VecExtractDictObs(env, "rgb")
    monitor_env = VecMonitor(env)
    model = PPO("MlpPolicy", monitor_env, verbose=1, n_steps=64, device="cpu")
    model.learn(total_timesteps=250)
Code Example #15
def main():
    #env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000)
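
The make_env helper used in this example (and in Examples #11, #20, and #30) is defined elsewhere in each project and is not shown here. Below is a minimal sketch of the usual rank-seeding factory that SubprocVecEnv expects, following the Stable-Baselines3 multiprocessing guide; the env-id-based signature is an assumption and differs from the instance-based call above.

import gym
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    """Return a thunk so each worker process builds and seeds its own env (illustrative)."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # older gym seeding API, matching the era of these snippets
        return env
    set_random_seed(seed)
    return _init

SubprocVecEnv needs callables rather than env instances because each subprocess must construct its own copy of the environment.
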
Code Example #16
class Agent(object):
    def __init__(self, env, model=None):
        if model:
            self.model = model
        else:
            self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
                ":", "-")
            os.makedirs(self.log_dir, exist_ok=True)
            monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
            vec_env = DummyVecEnv([lambda: monitor_env])
            policy_kwargs = dict(
                features_extractor_class=CustomCNN,
                features_extractor_kwargs=dict(features_dim=256),
                net_arch=[dict(pi=[64, 64], vf=[64, 64])])
            self.model = PPO(CustomCnnPolicy,
                             vec_env,
                             policy_kwargs=policy_kwargs,
                             verbose=1,
                             learning_rate=0.001)

    def function(self, obs, conf):
        import random
        col, _ = self.model.predict(np.array(obs['board']).reshape(
            6, 7, 1))  # TODO: Connect-4 specific so far
        is_valid = (obs['board'][int(col)] == 0)
        if is_valid:
            return int(col)
        else:
            return random.choice([
                col for col in range(conf.columns)
                if obs['board'][int(col)] == 0
            ])

    def train(self, timesteps):
        self.model.learn(total_timesteps=timesteps)

    def save(self, name: str):
        self.model.save(name)

    def load(self, name: str, env, replace_parameters=None):
        self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
            ":", "-")
        os.makedirs(self.log_dir, exist_ok=True)
        monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
        vec_env = DummyVecEnv([lambda: monitor_env])
        self.model = PPO.load(name,
                              env=vec_env,
                              custom_objects=replace_parameters)

    def plot(self):
        # Plot cumulative reward
        with open(os.path.join(self.log_dir, "monitor.csv"), 'rt') as fh:
            firstline = fh.readline()
            assert firstline[0] == '#'
            df = pd.read_csv(fh, index_col=None)['r']
        df.rolling(window=1000).mean().plot()
        plt.show()
Code Example #17
def test_ppo_warnings():
    """Test that PPO warns and errors correctly on
    problematic rollour buffer sizes"""

    # Only 1 step: advantage normalization will return NaN
    with pytest.raises(AssertionError):
        PPO("MlpPolicy", "Pendulum-v0", n_steps=1)

    # Truncated mini-batch
    with pytest.warns(UserWarning):
        PPO("MlpPolicy", "Pendulum-v0", n_steps=6, batch_size=8)
Code Example #18
File: test-PPO.py Project: jackt53ng/gym_fishing
def test_ppo():
    env = gym.make("fishing-v1")
    check_env(env)
    # takes about 200000 timesteps to get a decent policy (roughly a 12-minute test)
    model = PPO("MlpPolicy", env, verbose=0)
    model.learn(total_timesteps=200)

    # Simulate a run with the trained model, visualize result
    df = env.simulate(model)
    env.plot(df, "PPO-test.png")

    # Evaluate model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
Code Example #19
def test_vec_monitor_ppo(recwarn):
    """
    Test the `VecMonitor` with PPO
    """
    env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    env.seed(0)
    monitor_env = VecMonitor(env)
    model = PPO("MlpPolicy", monitor_env, verbose=1, n_steps=64, device="cpu")
    model.learn(total_timesteps=250)

    # No warnings because using `VecMonitor`
    evaluate_policy(model, monitor_env)
    assert len(recwarn) == 0
Code Example #20
def main():
  # Create the callback: check every 1000 steps
  log_dir = 'log'
  callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
  num_cpu = 16
  model_stats_path = os.path.join(log_dir, "sac_" + env_name)
  env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
  tb_log = 'tb_log'
  videoName = '5M_timesteps_sac'
  tb_log_name = videoName

  if StartFresh:
    # env = make_vec_env(env_name, n_envs=4)
    # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    env.reset()
    policy_kwargs = {
        'net_arch': [128, 64, 32],
    }
    model = PPO('MlpPolicy',
                env,
                learning_rate=0.001,
                n_steps=500,
                # batch_size=0,
                # n_epochs=1,
                gamma=0.9,
                policy_kwargs=policy_kwargs,
                verbose=1,
                tensorboard_log=tb_log,
                device="auto")
  else:
    env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    env = VecNormalize.load(env_stats_path, env)
    env.reset()

    model = PPO.load(model_stats_path, tensorboard_log=tb_log)
    model.set_env(env)

  if DoTraining:
    eval_env = make_vec_env(env_name, n_envs=1)
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    eval_env.reset()
    # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
    model.learn(total_timesteps=25000000, tb_log_name=tb_log_name, reset_num_timesteps=False)  # callback=callback

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(model_stats_path)
    env.save(env_stats_path)

  if DoVideo:
    # mean_reward, std_reward = evaluate_policy(model, eval_env)
    # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
    record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
Code Example #21
def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    env = VecNormalize(env,
                       training=True,
                       norm_obs=True,
                       norm_reward=True,
                       gamma=0.9997,
                       clip_obs=10.,
                       clip_reward=10.,
                       epsilon=0.1)

    drive = PPO("MlpPolicy",
                env,
                ent_coef=0.01,
                vf_coef=1,
                batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000,
                n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log",
                verbose=1)

    drive.learn(total_timesteps=total_timesteps, callback=callback)

    for i in range(total_train_runs):
        env.close()
        drive.learn(total_timesteps=total_timesteps,
                    callback=callback,
                    reset_num_timesteps=False)

    drive.save("conduziadrive")
Code Example #22
def run(config: Dict[str, Any], logdir: pathlib.PosixPath):
    env = make_env(config)

    if config["mode"] == "evaluate":
        print("Start evaluation.")
        model = PPO.load(logdir / "model.zip")
    elif config["mode"] == "train" and args.logdir:
        print("Start training from existing model.")
        model = PPO.load(logdir / "model.zip")
        model.set_env(env)
        model.learn(total_timesteps=config["train_steps"])
    else:
        print("Start training.")
        model = PPO(
            "CnnPolicy",
            env,
            verbose=1,
            tensorboard_log=logdir / "tensorboard",
            use_sde=True,
        )
        model.learn(total_timesteps=config["train_steps"])

    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=config["eval_eps"], deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    if config["mode"] == "train":
        model.save(logdir / "model")

    env.close()
Code Example #23
def main():
    env = gym.make(ENV_NAME)
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=100000)

    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()

    env.close()
Code Example #24
File: testagent.py Project: nissmar/bilboquetAI
def trained_agent(episodes=256,
                  continuous=True,
                  load=None,
                  save_name="test",
                  ent_coef=0.00001,
                  total_timesteps=25000,
                  learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))

    if load is None:
        model = PPO('MlpPolicy',
                    env,
                    verbose=1,
                    ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log=f"./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()

    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
Code Example #25
File: run_rl.py Project: WSH95/my_DRL_sim
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)

    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not (os.path.exists(policy_save_dir)):
        os.makedirs(policy_save_dir)

    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Code Example #26
File: pinokio3.py Project: JEdward7777/Pinokio
def main():

    tensorboard_log = "./log"

    env = Pinokio3()
    # PPO requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch)
        model = PPO(MlpPolicy,
                    DummyVecEnv([lambda: env]),
                    verbose=1,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=tensorboard_log)

    #https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html
    checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./checkpoints/',
                                         name_prefix='pinokio3')


    while True:
        model.learn(total_timesteps=15000000, callback=checkpoint_callback, tb_log_name=tb_log_name)

        model.save(save_file)
        print("saved")

        obs = env.reset()
        for i in range(20):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            print("action {} -> reward {}".format(env.decode_action(action), reward))
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Code Example #27
def main():

    env = Pinokio2()
    # PPO requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)

    while True:
        #model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)

        model.save(save_file)

        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Code Example #28
File: test.py Project: kdh0429/TorchDeepMimic
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Code Example #29
def load_model(model_path,
               policy_class,
               policy_kwargs,
               env,
               hp,
               partners,
               testing,
               try_load=True):
    load_successful = False

    if try_load:
        try:
            model = PPO.load(model_path)  #, policy_kwargs=policy_kwargs)
            load_successful = True
            print("Model loaded successfully")
        except Exception as e:
            print("Could not load model", e)

    if not load_successful:
        print("Create new model")

        n_steps, batch_size, n_epochs = hp['n_steps'], hp['batch_size'], hp['n_epochs']
        model = PPO(policy_class,
                    env,
                    policy_kwargs=policy_kwargs,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    n_epochs=n_epochs,
                    verbose=0,
                    ent_coef=0.00,
                    marginal_reg_coef=hp['mreg'])

        for name, param in model.policy.named_parameters():
            if param.requires_grad:
                print(name, param.data.size())

    vec_env = DummyVecEnv([lambda: env])
    model.set_env(vec_env)

    model.policy.set_partners(partners)
    if testing:
        model.policy.num_partners = 1  # only test 1 partner
        model.marginal_reg_coef = 0
        model.n_epochs = hp['n_epochs_testing']
        model.n_steps = hp['n_steps_testing']
        model._init_rollout_buffer()

    return model
Code Example #30
def main():
    #env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir
    log_dir = './ppo_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)
    callback = custom_call_back.CustomCallback(check_freq=1000, log_dir=log_dir)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000, callback=callback)