Code example #1
def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    env = VecNormalize(env,
                       training=True,
                       norm_obs=True,
                       norm_reward=True,
                       gamma=0.9997,
                       clip_obs=10.,
                       clip_reward=10.,
                       epsilon=0.1)

    drive = PPO("MlpPolicy",
                env,
                ent_coef=0.01,
                vf_coef=1,
                batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000,
                n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log",
                verbose=1)

    drive.learn(total_timesteps=total_timesteps, callback=callback)

    for i in range(total_train_runs):
        env.close()
        drive.learn(total_timesteps=total_timesteps,
                    callback=callback,
                    reset_num_timesteps=False)

    drive.save("conduziadrive")
Code example #2
File: run_rl.py Project: WSH95/my_DRL_sim
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)

    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not os.path.exists(policy_save_dir):
        os.makedirs(policy_save_dir)

    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Code example #3
File: pinokio3.py Project: JEdward7777/Pinokio
def main():

    tensorboard_log = "./log"

    env = Pinokio3()
    # Optional: PPO requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch)
        # use the custom network architecture defined above
        model = PPO(MlpPolicy,
                    DummyVecEnv([lambda: env]),
                    verbose=1,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=tensorboard_log)

    # https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html
    checkpoint_callback = CheckpointCallback(save_freq=10000,
                                             save_path='./checkpoints/',
                                             name_prefix='pinokio3')


    while True:
        model.learn(total_timesteps=15000000,
                    callback=checkpoint_callback,
                    tb_log_name=tb_log_name)

        model.save(save_file)
        print("saved")

        obs = env.reset()
        for i in range(20):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            print("action {} -> reward {}".format(env.decode_action(action), reward))
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Code example #4
def train_pa_ppo(path='pa_ppo'):
    """
    1/3の確率で出を出す環境での学習を行う。
    引数:
        path    学習済みモデルファイルパス
    戻り値:
        なし
    """
    print(f'train ppo with jurina_player path={path}')
    # じゃんけん環境の構築
    env = RockPaperScissorsEnv(JurinaPlayer())
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # PPOモデルの初期化
    model = PPO('MlpPolicy', env, verbose=1)

    # トレーニング実行
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapse time: {time.time() - elapsed}sec')

    # 学習済みモデルの保存
    model.save(path)

    # じゃんけん環境のクローズ
    env.close()
Code example #5
File: pinokio5.py Project: JEdward7777/Pinokio
def main():

    tensorboard_log = "./log"

    env = Pinokio5()
    # Optional: PPO requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log)

    try:
        while True:
            #model.learn(total_timesteps=10000)
            model.learn(total_timesteps=8000000, tb_log_name=tb_log_name)

            model.save(save_file)

            obs = env.reset()
            for i in range(100):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                env.render()
                if done:
                    print("resetting because " + str(done))
                    env.reset()
    except KeyboardInterrupt:
        print("Saving before exiting...")
        model.save(save_file)
        print("k bye")
Code example #6
def train():
    omniverse_kit = OmniKitHelper(CUSTOM_CONFIG)

    # We disable all anti-aliasing in the render because we want to train on the raw camera image.
    omniverse_kit.set_setting("/rtx/post/aa/op", 0)

    env = TestEnv(omniverse_kit, max_resets=10, updates_per_step=3)

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path="./params/",
                                             name_prefix="rl_model")

    net_arch = [512, 256, dict(pi=[128, 64, 32], vf=[128, 64, 32])]
    policy_kwargs = {
        "net_arch": net_arch,
        "features_extractor_class": CustomCNN,
        "activation_fn": torch.nn.ReLU
    }

    model = PPO("CnnPolicy",
                env,
                verbose=1,
                tensorboard_log="tensorboard",
                policy_kwargs=policy_kwargs,
                device="cuda")
    # model = PPO.load("checkpoint_25k.zip",env)
    model.learn(
        total_timesteps=25000,
        callback=checkpoint_callback,
        eval_env=env,
        eval_freq=1000,
        eval_log_path="./eval_log/",
        reset_num_timesteps=False,
    )
    model.save("checkpoint_25k")
Code example #7
File: testagent.py Project: nissmar/bilboquetAI
def trained_agent(episodes=256,
                  continuous=True,
                  load=None,
                  save_name="test",
                  ent_coef=0.00001,
                  total_timesteps=25000,
                  learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))

    if load is None:
        model = PPO('MlpPolicy',
                    env,
                    verbose=1,
                    ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log=f"./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()

    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
Code example #8
def save_new_model(name, env, num_envs, model_dir, batch_size=None, n_steps=None,
        n_epochs=None, clip_range=None, gamma=None, gae_lambda=None, vf_coef=None,
        ent_coef=None, learning_rate=None, image_based=False, image_pretrain=None,
        verbose=0, w=.1):
    if not batch_size:
        batch_size = choose_hyperp("batch_size", 10, w=w)
    if not n_steps:
        n_steps = max(batch_size, choose_hyperp("n_steps", 10, w=w))//num_envs
    if not n_epochs:
        n_epochs = choose_hyperp("n_epochs", 2, w=w)
    if not clip_range:
        clip_range = choose_hyperp("clip_range", 1, w=w)
    if not gamma:
        gamma = choose_hyperp("gamma", 2, w=w)
    if not gae_lambda:
        gae_lambda = choose_hyperp("gae_lambda", 1, w=w)
    if not vf_coef:
        vf_coef = choose_hyperp("vf_coef", 0, w=w)
    if not ent_coef:
        ent_coef = choose_hyperp("ent_coef", 0, w=w)
    if not learning_rate:
        learning_rate = choose_hyperp("learning_rate", 5, w=w)
        
    feature_extractor = "MlpPolicy"
    if image_based:
        feature_extractor = "CnnPolicy"
    
    model = PPO(feature_extractor, env, batch_size=batch_size, n_steps=n_steps, 
                n_epochs=n_epochs, clip_range=clip_range, gamma=gamma, gae_lambda=gae_lambda,
                vf_coef=vf_coef, ent_coef=ent_coef, learning_rate=learning_rate, verbose=verbose)
    if image_based and image_pretrain:
        model.policy.features_extractor.cnn.load_state_dict(T.load(image_pretrain+"_cnn.pth"))
        model.policy.features_extractor.linear.load_state_dict(T.load(image_pretrain+"_linear.pth"))
    model.save(model_dir + name + '/' + name + "_0")
    return model
Code example #9
def ppo_stable_baselines_training():
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    envs = make_vec_env(config.env_name, n_envs=config.num_processes)

    model = PPO("CnnPolicy",
                envs,
                verbose=1,
                tensorboard_log="./runs/",
                clip_range=config.clip_param,
                n_steps=50,
                learning_rate=config.lr,
                gamma=config.gamma,
                gae_lambda=config.gae_lambda,
                ent_coef=config.entropy_coef,
                max_grad_norm=config.max_grad_norm,
                vf_coef=config.value_loss_coef,
                batch_size=config.num_mini_batch)
    model.learn(total_timesteps=config.num_steps,
                log_interval=1,
                callback=WandbStableBaselines3Callback())
    model.save(f"{config.env_name}_stable_baselines_ppo")
Code example #10
    def train_rl(self,
                 models_to_train=40,
                 episodes_per_model=100,
                 path='./logs/'):
        # Specify the RL algorithm to train (e.g. ACKTR, TRPO, ...)

        # Callback for saving the best agent during training
        eval_callback = EvalCallback(self.env,
                                     best_model_save_path=path,
                                     log_path=path,
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)

        model = PPO(MlpPolicy,
                    self.env,
                    verbose=1,
                    learning_rate=0.0003,
                    tensorboard_log=path)
        start = time.time()

        for i in range(models_to_train):
            steps_per_model = episodes_per_model * self.param.steps_per_episode
            model.learn(total_timesteps=steps_per_model,
                        callback=eval_callback)
            model.save("MODEL_" + str(i))

        end = time.time()
        print("time (min): ", (end - start) / 60)
Code example #11
def run(config: Dict[str, Any], logdir: pathlib.PosixPath):
    env = make_env(config)

    if config["mode"] == "evaluate":
        print("Start evaluation.")
        model = PPO.load(logdir / "model.zip")
    elif config["mode"] == "train" and args.logdir:
        print("Start training from existing model.")
        model = PPO.load(logdir / "model.zip")
        model.set_env(env)
        model.learn(total_timesteps=config["train_steps"])
    else:
        print("Start training.")
        model = PPO(
            "CnnPolicy",
            env,
            verbose=1,
            tensorboard_log=logdir / "tensorboard",
            use_sde=True,
        )
        model.learn(total_timesteps=config["train_steps"])

    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=config["eval_eps"], deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    if config["mode"] == "train":
        model.save(logdir / "model")

    env.close()
Code example #12
def pybullet_example():
    # PyBullet: Normalizing input features

    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
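
As a follow-up to example #12, the reloaded agent and normalization statistics can be evaluated together with evaluate_policy; a small sketch reusing the env and model from the example:

from stable_baselines3.common.evaluation import evaluate_policy

# env is the VecNormalize-wrapped DummyVecEnv loaded above
# (training disabled, reward normalization turned off)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
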
Code example #13
def main():
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # controlling performance level of agent

    try:
        os.mkdir(save_dir)
    except OSError as error:
        pass

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    if os.path.exists(load_path):
        alg = PPO(train_policy, alg_env, verbose=0)
        alg.set_parameters(load_path, exact_match=True)
        # alg = PPO.load(load_path, env=alg_env)
        print('loaded alg checkpoint ' + load_path)
    else:
        alg = PPO(train_policy, alg_env, verbose=0)
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint ' + save_path)
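
Example #13 (like example #26) relies on a make_env helper that returns a thunk for SubprocVecEnv. A minimal sketch following the Stable-Baselines3 multiprocessing example; the env id and seeding are assumptions, and the projects' helpers take different arguments:

import gym
from stable_baselines3.common.utils import set_random_seed

def make_env(rank: int, env_id: str = "CartPole-v1", seed: int = 0):
    """Return a thunk that builds one seeded environment instance."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init
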
Code example #14
def main():
    # Instantiate the env
    env = Gaze(fitts_W=fitts_W,
               fitts_D=fitts_D,
               ocular_std=ocular_std,
               swapping_std=swapping_std)
    env = Monitor(env, log_dir)

    # Create the agent
    model = PPO('MlpPolicy', env, verbose=0, clip_range=0.15)

    # Save a checkpoint periodically (used by the learn() call below)
    save_freq_n = int(timesteps / 10)
    checkpoint_callback = CheckpointCallback(save_freq=save_freq_n,
                                             save_path=f'{log_dir}savedmodel/',
                                             name_prefix='model_ppo')

    # Train the agent
    model.learn(total_timesteps=int(timesteps), callback=checkpoint_callback)

    # Save the model
    model.save(f'{log_dir}savedmodel/model_ppo')

    # Plot the learning curve
    plot_results2(log_dir)

    save_learned_behaviour()
Code example #15
def main():

    env = Pinokio2()
    # Optional: PPO requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)

    while True:
        #model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)

        model.save(save_file)

        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Code example #16
File: example_rl.py Project: wbap/PyLIS
def learn(env_name, save_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()
Code example #17
File: test.py Project: kdh0429/TorchDeepMimic
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy the trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Code example #18
File: train.py Project: gkswamy98/pillbox
def train_lunarlander_expert():
    env = make_vec_env('LunarLander-v2', n_envs=16)
    # Used default hyperparams, as the tuned ones did not seem to work that well.
    model = PPO('MlpPolicy', env, verbose=1,
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=2e6)
    model.save("experts/LunarLander-v2/lunarlander_expert")
    gen_expert_demos('LunarLander-v2', gym.make('LunarLander-v2'), model, 25)
Code example #19
 def __call__(self):
     policy_kwargs = dict(activation_fn=th.nn.ReLU)
     model = PPO('CnnPolicy',
                 self.env,
                 learning_rate=1e-3,
                 policy_kwargs=policy_kwargs).learn(self.total_time_steps)
     model.save('PPO_' + self.game_name)
     del model  # since the model has been trained, it's no longer needed
Code example #20
File: atari.py Project: Bing-Yuan/fitre
def train_ppo(config):
    print("RUNNING PPO")
    env = get_env()
    callback = ReportCallback()
    # model = PPO('MlpPolicy', env, verbose=1)
    model = PPO('CnnPolicy', env, verbose=1, **config)
    model.learn(total_timesteps=1000000, callback=callback)
    model.save("ppo_pong")
    return
Code example #21
def train():
    """Trains a PPO2 policy."""

    env_args = env_parser.parse_known_args()[0]
    policy_args = policy_parser.parse_known_args()[0]
    opt_args = opt_parser.parse_known_args()[0]

    os.makedirs(opt_args.save_path, exist_ok=True)

    # create environment
    # train_env = GFootballEnv(env_args) # for evaluation
    train_env = DummyVecEnv([
        make_env(env_args, opt_args.save_path, rank=i)
        for i in range(opt_args.num_envs)
    ])
    eval_env = GFootballEnv(env_args)  # for evaluation
    check_env(env=eval_env, warn=True)

    # define rl policy/value network
    policy = getattr(sys.modules[__name__], policy_args.policy)

    # initialize ppo
    tb_dir = os.path.join(opt_args.save_path, "tensorboard")
    os.makedirs(tb_dir, exist_ok=True)
    verbose = 1
    ppo = PPO(policy,
              train_env,
              learning_rate=opt_args.lr,
              n_steps=opt_args.n_steps,
              n_epochs=opt_args.n_epochs,
              gamma=opt_args.gamma,
              gae_lambda=0.95,
              clip_range=opt_args.clip_range,
              clip_range_vf=None,
              ent_coef=opt_args.ent_coef,
              vf_coef=opt_args.vf_coef,
              max_grad_norm=opt_args.max_grad_norm,
              tensorboard_log=tb_dir,
              verbose=verbose,
              seed=opt_args.seed)

    # load initial checkpoint into the already-configured model
    # (PPO.load is a classmethod that returns a new model, so use set_parameters here)
    if opt_args.load_path:
        ppo.set_parameters(os.path.join(opt_args.load_path, "ppo_gfootball.pt"))

    # start training ppo
    eval_dir = os.path.join(opt_args.save_path, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    ppo.learn(opt_args.num_timesteps,
              log_interval=1,
              eval_env=eval_env,
              eval_freq=opt_args.save_interval,
              n_eval_episodes=10,
              eval_log_path=eval_dir)

    # save final checkpoint
    ppo.save(os.path.join(opt_args.save_path, "ppo_gfootball"))
Code example #22
 def train(time_steps, save=False, **params):
     env = PPOAgent.create_env(1)
     model = PPO('CnnPolicy',
                 env,
                 verbose=params.get('verbose', 1),
                 tensorboard_log=TB_LOGS)
     model.learn(total_timesteps=time_steps)
     if save:
         model.save(MODEL_PATH)
Code example #23
File: train.py Project: gkswamy98/pillbox
def train_cartpole_expert():
    env = make_vec_env('CartPole-v1', n_envs=8)
    model = PPO('MlpPolicy', env, verbose=1,
                n_steps=32, batch_size=256, gae_lambda=0.8, gamma=0.98,
                n_epochs=20, ent_coef=0.0, learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.2), policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=1e5)
    model.save("experts/CartPole-v1/cartpole_expert")
    gen_expert_demos('CartPole-v1', gym.make('CartPole-v1'), model, 25)
Code example #24
class Agent(object):
    def __init__(self, env, model=None):
        if model:
            self.model = model
        else:
            self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
                ":", "-")
            os.makedirs(self.log_dir, exist_ok=True)
            monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
            vec_env = DummyVecEnv([lambda: monitor_env])
            policy_kwargs = dict(
                features_extractor_class=CustomCNN,
                features_extractor_kwargs=dict(features_dim=256),
                net_arch=[dict(pi=[64, 64], vf=[64, 64])])
            self.model = PPO(CustomCnnPolicy,
                             vec_env,
                             policy_kwargs=policy_kwargs,
                             verbose=1,
                             learning_rate=0.001)

    def function(self, obs, conf):
        import random
        col, _ = self.model.predict(np.array(obs['board']).reshape(
            6, 7, 1))  # TODO: Connect-4 specific so far
        is_valid = (obs['board'][int(col)] == 0)
        if is_valid:
            return int(col)
        else:
            return random.choice([
                col for col in range(conf.columns)
                if obs['board'][int(col)] == 0
            ])

    def train(self, timesteps):
        self.model.learn(total_timesteps=timesteps)

    def save(self, name: str):
        self.model.save(name)

    def load(self, name: str, env, replace_parameters=None):
        self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
            ":", "-")
        os.makedirs(self.log_dir, exist_ok=True)
        monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
        vec_env = DummyVecEnv([lambda: monitor_env])
        self.model = PPO.load(name,
                              env=vec_env,
                              custom_objects=replace_parameters)

    def plot(self):
        # Plot cumulative reward
        with open(os.path.join(self.log_dir, "monitor.csv"), 'rt') as fh:
            firstline = fh.readline()
            assert firstline[0] == '#'
            df = pd.read_csv(fh, index_col=None)['r']
        df.rolling(window=1000).mean().plot()
        plt.show()
Code example #25
File: train.py Project: gkswamy98/pillbox
def train_pendulum_expert():
    env = make_vec_env('Pendulum-v0', n_envs=8)
    model = PPO('MlpPolicy', env, verbose=1,
                n_steps=2048, batch_size=64, gae_lambda=0.95, gamma=0.99,
                n_epochs=10, ent_coef=0.0, learning_rate=3e-4,
                clip_range=0.2, policy_kwargs=dict(net_arch=[256, 256]))
    model.learn(total_timesteps=2e6)
    model.save("experts/Pendulum-v0/pendulum_expert")
    gen_expert_demos('Pendulum-v0', gym.make('Pendulum-v0'), model, 25)
Code example #26
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # optionally: callback=callback or a TensorboardCallback()

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
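
Example #26 ends by calling a record_video helper that is not shown. A sketch based on VecVideoRecorder with the same call signature; this is an assumption about the project's helper, and if the model was trained with VecNormalize the evaluation env should be wrapped with the saved statistics first:

import gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """Roll out the model for video_length steps and save the result as a video."""
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    eval_env.close()
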
Code example #27
def train_PPO(env_train, model_name, timesteps=50000):
    """PPO model"""

    start = time.time()
    model = PPO('MlpPolicy', env_train)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (PPO): ', (end - start) / 60, ' minutes')
    return model
Code example #28
def train_ppo(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_ppo_{itr}")
    obs = env.reset()
    model = PPO(
        "CnnPolicy",
        env,
        verbose=1,
        learning_rate=1e-5,
        tensorboard_log=f"./ppo_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"ppo_flappy_{itr}")
Code example #29
def train_ppo():

    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)
    model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=4800, callback=callback, log_interval=480)
    model.save('model_save/PPO')
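
Examples #1, #26 and #29 use a SaveOnBestTrainingRewardCallback without defining it. The Stable-Baselines3 documentation includes a callback of this name that reads the Monitor log and saves the model whenever the rolling mean reward improves; a condensed sketch (the projects' versions may differ):

import os
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Check the Monitor log every check_freq steps and keep the best model."""

    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])  # mean reward over the last 100 episodes
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True
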
Code example #30
def train():
    env = gym.make(TRAIN_ENV)
    env = PovOnlyObservation(env)
    env = ActionShaping(env, always_attack=True)

    # For all the PPO hyperparameters you could tune see this:
    # https://github.com/DLR-RM/stable-baselines3/blob/6f822b9ed7d6e8f57e5a58059923a5b24e8db283/stable_baselines3/ppo/ppo.py#L16
    model = PPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=TRAIN_TIMESTEPS)  # 2m steps is about 8h at 70 FPS
    model.save(TRAIN_MODEL_NAME)

    env.close()
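
Example #30 wraps the env with PovOnlyObservation and ActionShaping, which are not shown. The first is typically a thin observation wrapper that keeps only the image part of the dict observation so that CnnPolicy can be used; a sketch of that idea (an assumption about how the project defines it):

import gym

class PovOnlyObservation(gym.ObservationWrapper):
    """Keep only the 'pov' image from a dict observation."""

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = env.observation_space['pov']

    def observation(self, observation):
        return observation['pov']
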