Code example #1
def test_recurrent_eval_callback():
    env_id = 'Pendulum-v0'

    # Create envs
    env = make_vec_env(env_id, n_envs=4)
    eval_env = make_vec_env(env_id, n_envs=1)

    # Create RL model
    model = PPO2('MlpLstmPolicy', env)

    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER,
                                 eval_freq=100)

    model.learn(300, callback=eval_callback)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
Code example #2
 def create_model(self, n_envs=1):
     """ Create env and agent model """
     env_cls = SprEnv
     self.env = make_vec_env(env_cls,
                             n_envs=n_envs,
                             env_kwargs={"params": self.params},
                             seed=self.params.seed)
     self.model = ACKTR(
         self.policy,
         self.env,
         gamma=self.params.agent_config['gamma'],
         n_steps=self.params.agent_config['n_steps'],
         ent_coef=self.params.agent_config['ent_coef'],
         vf_coef=self.params.agent_config['vf_coef'],
         vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
         max_grad_norm=self.params.agent_config['max_grad_norm'],
         learning_rate=self.params.agent_config['learning_rate'],
         gae_lambda=self.params.agent_config['gae_lambda'],
         lr_schedule=self.params.agent_config['lr_schedule'],
         kfac_clip=self.params.agent_config['kfac_clip'],
         kfac_update=self.params.agent_config['kfac_update'],
         async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
         verbose=self.params.agent_config['verbose'],
         tensorboard_log="./tb/acktr/",
         seed=self.params.seed,
         policy_kwargs={"params": self.params})
Code example #3
def get_intrinsic_reward(base_index):
    intrinsic_rewards = [[] for _ in range(len(subenv_dict))]
    # base env
    base_name = subenv_dict[base_index]
    base_env = make_vec_env(f"selected-bipedal-{base_name}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{base_name}/model.zip")

    # rnd model
    rnd_dict = {}
    for client_env in subenv_dict.values():
        rnd = RandomNetworkDistillation(input_size=24)
        rnd.load(f"./base{base_index}_client_model/{client_env}/rnd")
        rnd_dict[client_env] = rnd
    obs = base_env.reset()
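    # Roll out the base agent, recording each client's RND intrinsic reward at every step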
    for _ in range(num_test):
        for i, client_env in subenv_dict.items():
            intrinsic_rewards[i].append(
                rnd_dict[client_env].get_intrinsic_reward(obs))
        action = base_agent.predict(obs)
        obs, reward, done, info = base_env.step(action[0])
        if done:
            obs = base_env.reset()
    return intrinsic_rewards
Code example #4
File: PPO.py Project: ai4ce/SNAC
def main_exp(arg):
    env = DMP_simulator_3d_dynamic_triangle.deep_mobile_printing_3d1r(
        plan_choose=arg["plan_choose"])
    env = make_vec_env(lambda: env, n_envs=1)
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512, 512])
    model = PPO2(MlpPolicy,
                 env,
                 policy_kwargs=policy_kwargs,
                 gamma=arg["gamma"],
                 n_steps=arg["n_steps"],
                 noptepochs=arg["noptepochs"],
                 ent_coef=arg["ent_coef"],
                 learning_rate=arg["learning_rate"],
                 vf_coef=arg["vf_coef"],
                 cliprange=arg["cliprange"],
                 nminibatches=arg["nminibatches"],
                 verbose=1,
                 tensorboard_log=arg["tensorboard_log"],
                 n_cpu_tf_sess=1,
                 seed=arg["seed"])
    time_steps = 1e7
    model.learn(total_timesteps=int(time_steps),
                tb_log_name=arg["tb_log_name"])
    model.save(arg["model_save_path"])
    return model
Code example #5
def make_alrs_env(args, test=False, baseline=False):
    """
    Make a new ALRS environment with parameters specified as command line arguments.
    """
    from environment import AdaptiveLearningRateOptimizer

    env = make_vec_env(
        env_id=AdaptiveLearningRateOptimizer,
        n_envs=1 if test else args.num_envs,
        env_kwargs={
            'dataset': args.dataset,
            'architecture': args.architecture,
            'batch_size': args.batch_size,
            'update_freq': args.update_freq,
            'num_train_steps': args.num_train_steps,
            'initial_lr': args.initial_lr,
            'discrete': args.discrete,
            'action_range': np.inf if baseline else args.action_range,
            'lr_noise': not (test or baseline)
        }
    )
    env = VecNormalize(
        venv=env,
        norm_obs=args.ppo2_norm_obs,
        norm_reward=args.ppo2_norm_reward,
        clip_obs=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        clip_reward=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        gamma=args.ppo2_gamma
    )
    env.alrs = env.venv.envs[0].env

    return env
Code example #6
def create_env(n_envs, env_name=None, log_dir=None):
    return VecNormalize(make_vec_env(ENVS[env_name][env_id],
                                     n_envs=n_envs,
                                     env_kwargs=ENVS[env_name][env_kwargs],
                                     monitor_dir=log_dir),
                        norm_obs=False,
                        norm_reward=True)
Code example #7
def run_stable_baselines(
    reward_config_file,
    hysr_one_ball_config_file,
    ppo_config_file,
    log_episodes=False,
    log_tensorboard=False,
):

    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common import make_vec_env
    from stable_baselines import PPO2

    env_config = {
        "reward_config_file": reward_config_file,
        "hysr_one_ball_config_file": hysr_one_ball_config_file,
        "log_episodes": log_episodes,
        "log_tensorboard": log_tensorboard,
    }
    env = make_vec_env(HysrOneBallEnv, env_kwargs=env_config)

    ppo_config = PPOConfig.from_json(ppo_config_file)
    if log_tensorboard:
        # assumption: PPO2 takes a `tensorboard_log` directory rather than a boolean flag,
        # so a placeholder log directory is used here when tensorboard logging is requested
        model = PPO2(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log="./ppo2_hysr_one_ball_tb/",
                     **ppo_config)
    else:
        model = PPO2(MlpPolicy, env, verbose=1, **ppo_config)
    model.learn(total_timesteps=1000000)
    model.save("ppo2_hysr_one_ball")
Code example #8
def train(timesteps=TIMESTEPS):
    print(
        f"[INFO] STARTING TRAINING: {START_TIME} {ENVIRONMENT}-{POLICY_NAME}-{ALGO}"
    )
    print(f"[INFO] NETWORK ARCH {NETWORK_ARCH}")

    # use vectorized environments for the appropriate algorithms for a speed boost
    env = make_vec_env(ENVIRONMENT, NUM_ENVS)
    # the network architecture can be defined above for any policy
    policy_kwargs = dict(net_arch=NETWORK_ARCH)
    model = PPO2(policy=POLICY,
                 env=env,
                 verbose=0,
                 policy_kwargs=policy_kwargs,
                 tensorboard_log=TENSORBOARD_DIR,
                 n_steps=1,
                 learning_rate=LEARNING_RATE)
    if LOAD_MODEL:
        # PPO2.load is a classmethod that returns a new model, so re-assign the result
        model = PPO2.load(load_path=LOAD_DIR, env=env)
    print(f"[INFO] Training for TIMESTEPS {TIMESTEPS}")

    model.learn(total_timesteps=timesteps,
                log_interval=LOG_INTERVAL,
                tb_log_name=TB_LOG_NAME)  # experiment select
    print("[INFO] Done training")

    model.save(save_path=MODEL_DIR, cloudpickle=False)
    print(f"[INFO] MODEL SAVED TO {MODEL_DIR}")

    return 0
Code example #9
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")
    base_parameter_dict = base_agent.get_parameters()

    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()

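    # model_align is assumed to update base_parameter_dict in place, mixing in the client parameters (weight w, factor alpha)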
    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)

    aligned_agent.load_parameters(base_parameter_dict)
    avg_reward, reward_std = evaluate_policy(aligned_agent,
                                             base_env,
                                             n_eval_episodes=100)

    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
Code example #10
def test(args):

    print("testing the trained environment")

    env_info = {
        "args": args,
        "external_func": ext_func_list,
        "params": request_params
    }
    env = make_vec_env(tradingEnv.TradingEnvironment,
                       n_envs=args.num_envs,
                       env_kwargs={"env_info": env_info})

    #Constants for saving logs and models
    exp_name = args.exp_name
    save_dir = os.path.join(BASE_PATH, 'logs_models', exp_name)

    model_name = os.path.join(save_dir, "Trading_exp_1_finished")

    model = PPO2.load(model_name)

    obs = env.reset()
    #Test for n steps
    for i in range(1000):
        # test_model_load.start_innvestigate(new_model, obs)
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)

        # `dones` is an array with one flag per env, so check whether any env finished
        if dones.any():
            print("RESET")
            if args.visualize:
                env.render()
Code example #11
def createVectorizedEnv():
    # Wait for the scripts to start up.
    # The flow leaves H1 and goes to H2
    env = LoadBalanceEnvDiscAction(source_port_index=0, source_switch_index=0, target_port_index=0, target_switch_index=2)
    env = make_vec_env(lambda: env, n_envs=1)

    return env
Code example #12
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env,
                       n_envs=1,
                       monitor_dir=save_dir,
                       env_kwargs=env_config)
    n_actions = env.action_space.shape[-1]
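    # Gaussian exploration noise with sigma 0.5, applied independently to each action dimension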
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.5 * np.ones(n_actions))

    model = TD3(
        MlpPolicy,
        env,
        action_noise=action_noise,
        verbose=1,
        gamma=0.99,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=100,
        learning_rate=1e-3,
        train_freq=1000,
        gradient_steps=1000,
        policy_kwargs={"layers": [64, 64]},
        n_cpu_tf_sess=1,
    )

    num_epochs = 1
    total_steps = 5e5

    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(total_steps / num_epochs))
        model.save(save_dir + "/model.zip")
Code example #13
File: run_ppo.py Project: aditya-attawar/bball
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env,
                       n_envs=8,
                       monitor_dir=save_dir,
                       env_kwargs=env_config)
    #    env = VecNormalize(env)

    model = PPO2(
        MlpPolicy,
        env,
        verbose=1,
        seed=int(seed),
        # normalize = True
        # policy = 'MlpPolicy',
        n_steps=2048,
        nminibatches=32,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        ent_coef=0.0,
        learning_rate=2.5e-4,
        cliprange=0.2,
        cliprange_vf=-1,
    )

    num_epochs = 5

    for epoch in range(num_epochs):

        model.learn(total_timesteps=int(num_steps / num_epochs))
        model.save(save_dir + "/model.zip")
Code example #14
File: ppo2.py Project: xclmj/deepwell
 def retrain(self, env, timesteps, modelpath, tensorboard_logs_path):
     model = self.load(modelpath, tensorboard_logs_path)
     env_str = self.get_env_str(env)
     model.set_env(make_vec_env(env_str, n_envs=8))
     model.learn(total_timesteps=timesteps, reset_num_timesteps=False, tb_log_name="TB_"+datetime.now().strftime('%d%m%y-%H%M'))      #Continue training
     model.save(modelpath)
     return model
Code example #15
File: jupong2d_ppo2.py Project: RCX112/Pong-DeepRL
 def __init__(self, env, output, train_steps, total_time_steps, session, paddle_length_factor=None, paddle_speed_factor=None,
              ball_speed_factor=None):
     """
     The constructor of the class 'JuPong2D_PPO2' creates a vectorized Gym-Environment with a specific parameter set.
     The neuronal networks will be saved in the output folder after every 'total_time_steps' step. For a more
     accurate training, a parameter 'session' will be used to train the same model multiple times.
     :param env: The Gym-Environment to load
     :param output: The output folder for the neuronal networks
     :param total_time_steps: Training duration before saving
     :param session: Session-ID for a specific training configuration
     :param paddle_length_factor: Factor for the paddle length
     :param paddle_speed_factor: Factor for the paddle speed
     :param ball_speed_factor: Factor for the ball speed
     """
     self.train_steps = train_steps
     self.total_time_steps = total_time_steps
     self.env_name = env
     self.session = session
     self.env = make_vec_env(self.env_name, n_envs=4)
     self.output = output
     self.paddle_length_factor = paddle_length_factor
     self.paddle_speed_factor = paddle_speed_factor
     self.ball_speed_factor = ball_speed_factor
     self.save_name = "stablebl_ppo2_save"
     self.create_save_folder()
     self.make_save_path()
     self.create_model()
Code example #16
def train_with_forward_search(env_name, pop_size, total_timesteps, train_timesteps, LOGS, FORWARD_SEARCH_MODEL, args, seed):
    env = make_vec_env(env_name, n_envs=args.n_envs)
    model = PPO(MlpPolicy, env, n_steps=args.n_steps, nminibatches=args.nminibatches, noptepochs=args.noptepochs, ent_coef=args.ent_coef, learning_rate=args.learning_rate, lam=args.lam, gamma=args.gamma, cliprange=args.cliprange, cliprange_vf=args.cliprange_vf)
#     model = PPO(MlpPolicy, env, seed=seed)
    timesteps = []
    mean_reward = []
    std_reward = []

    if os.path.exists(os.path.join(LOGS, 'forward_search_train_stats.npz')):
        train_stats = np.load(os.path.join(LOGS, 'forward_search_train_stats.npz'))
        pid = int(train_stats['pid'])
        completed_steps = int(train_stats['completed_steps'])
        model = PPO.load(FORWARD_SEARCH_MODEL, env=env, pid=pid)
        print("Loading forward search model with pid:{}, completed_steps:{}".format(pid, completed_steps))
    else:
        pid = os.getpid()
        completed_steps = 0

    epochs = total_timesteps//train_timesteps
    model.save(FORWARD_SEARCH_MODEL, pid=pid)

    print("Running forward search with population size: {}, epochs: {}".format(pop_size, epochs))
    print("PID:{}".format(pid))
    
    for epoch in range(completed_steps // train_timesteps, total_timesteps//train_timesteps):
        with mp.get_context("spawn").Pool(pop_size) as pool:
            pooled_results = pool.starmap(forward_search,
                        ((train_timesteps, env_name, args.n_envs, FORWARD_SEARCH_MODEL, seed, pid)
                            for _ in range(pop_size)))


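        # Each forward_search worker returns [parameters, pid, mean_reward, std_reward]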
        pooled_results = np.array(pooled_results)
        models_parameters = pooled_results[:, 0]
        process_ids = pooled_results[:, 1]
        mean_rewards = pooled_results[:, 2]
        std_rewards = pooled_results[:, 3]

#         for idx in range(pooled_results.shape[0]):
#             _, pid, mean, std = pooled_results[idx]
#             print(pid, mean, std)

        ind = np.argmax(mean_rewards)
        print("Epoch:{} Best child index from population: {}, Mean Reward:{}, Std Reward:{}".format(epoch + 1, ind, mean_rewards[ind], std_rewards[ind]))

        model = PPO.load(FORWARD_SEARCH_MODEL, env=env, pid=process_ids[ind])
        model.load_parameters(models_parameters[ind], exact_match=True)
        model.save(FORWARD_SEARCH_MODEL, pid=pid)
        
        timesteps.append((epoch + 1) * train_timesteps)
        mean_reward.append(mean_rewards[ind])
        std_reward.append(std_rewards[ind])

        plot_reward(np.array(timesteps), np.array(mean_reward), np.array(mean_reward) - np.array(std_reward), np.array(mean_reward) + np.array(std_reward), figname=os.path.join(LOGS, 'fsepoch{}.png'.format(epoch + 1)))

        with open(os.path.join(LOGS, 'forward_search.csv'), 'a') as f:
            csvwriter = csv.writer(f, delimiter=',')
            csvwriter.writerow([epoch + 1, ind, mean_rewards[ind], std_rewards[ind], pooled_results[:, 1:]])
    
        np.savez_compressed(os.path.join(LOGS, 'train_stats.npz'), timesteps=timesteps, mean_reward=mean_reward, std_reward=std_reward, pid=pid, completed_steps=((epoch + 1) * train_timesteps))
Code example #17
def train():
    env = make_vec_env('My-CartPole-v0', n_envs=1)

    model = PPO2(MlpPolicy, env, verbose=0)
    n = 250000
    model.learn(total_timesteps=n)

    model.save("./weights/ppo2_cartpole" + str(n))
Code example #18
def parse_hyperparams(args):
    storage_name = f"sqlite:///tuning_studies/{args.study_name}.db"
    study = optuna.load_study(study_name=args.study_name, storage=storage_name)
    trial = study.best_trial
    params = trial.params
    # Make the environment depending on number of environments in params
    try:
        env = make_vec_env(
            lambda: gym.make(args.env, **args.env_kwargs),
            n_envs=params["n_envs"],
        )
        params.pop("n_envs")
    except Exception:
        env = gym.make(args.env)
    # Constructing the network architecture
    # Mapping net_arch to actual network architectures for SB
    net_arch = {
        "small": dict(pi=[64, 64], vf=[64, 64]),
        "med": dict(pi=[256, 256], vf=[256, 256]),
        "large": dict(pi=[400, 400], vf=[400, 400]),
    }[params["net_arch"]]
    # Creating a custom LSTM policy

    class CustomLSTMPolicy(LstmPolicy):
        def __init__(
            self,
            sess,
            ob_space,
            ac_space,
            n_env,
            n_steps,
            n_batch,
            n_lstm=params["n_lstm"],
            reuse=False,
            **_kwargs,
        ):
            super().__init__(
                sess,
                ob_space,
                ac_space,
                n_env,
                n_steps,
                n_batch,
                n_lstm,
                reuse,
                net_arch=[100, "lstm", net_arch],
                layer_norm=True,
                feature_extraction="mlp",
                **_kwargs,
            )

    # Deleting keys that can't be used in SB models
    keys_to_delete = ["batch_size", "n_lstm", "net_arch", "joker"]
    if "lambda" in params:
        keys_to_delete.append("lambda")
        params["lam"] = params["lambda"]
    for key in keys_to_delete:
        params.pop(key, None)
    return params, CustomLSTMPolicy, env
Code example #19
 def _setup(self):
     # Game parameters
     self.env = make_vec_env(self.ENV_NAME, n_envs=self.num_envs)
     self.env.play_type = PLAY_TYPE.MACHINE
     self.env.render_mode = 'machine'
     self.env.MAX_TURNS = self.max_turns
     self.env.reset()
     # Report success
     print('Created new environment {0} with GameID: {1}'.format(self.ENV_NAME, self.GAME_ID))
Code example #20
def train():
	make_env_def()
	# multiprocess environment
	env = make_vec_env('AI4U-v0', n_envs=8)
	model = PPO2(CustomPolicy, env, verbose=1, n_steps=32, nminibatches=4, tensorboard_log="./logs/")
	model.learn(total_timesteps=1000000)
	model.save("ppo2_model")

	del model # remove to demonstrate saving and loading
Code example #21
def rendu(fichier):

    env = make_vec_env('CartPoleSwingUpContinuous-v0', n_envs=1)
    model = PPO2.load(fichier, cloudpickle=False)

    obs = env.reset()
    for _ in range(1000000):
        sleep(0.009)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
Code example #22
def rendu():
    env = make_vec_env('My-CartPole-v0', n_envs=1)

    model = PPO2.load("./weights/ppo2_cartpole250000.zip")

    obs = env.reset()
    while True:
        sleep(0.009)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
Code example #23
def train(args):
    # Using Stable Baselines
    """
    Train the algorithm (with a given policy).
    """

    env_info = {
        "args": args,
        "external_func": ext_func_list,
        "params": request_params
    }
    env = make_vec_env(tradingEnv.TradingEnvironment,
                       n_envs=args.num_envs,
                       env_kwargs={"env_info": env_info})
    #env = VecFrameStack(env, n_stack = 4)
    #Uncomment to enable visualizations!
    print("Vectorized env created")
    print("Creating model")

    #Constants for saving logs and models
    exp_name = args.exp_name
    save_dir = os.path.join(BASE_PATH, 'logs_models', exp_name)

    # Create PPO2 model now
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 tensorboard_log=save_dir,
                 full_tensorboard_log=False)

    # Train the model and save the results
    print("Training the network")
    try:
        #train for first 1 million Epochs
        steps_per_batch, num_envs = model.n_steps, env.num_envs

        model.learn(total_timesteps=args.num_epochs,
                    tb_log_name=exp_name,
                    log_interval=10)

        print("First training done")
        model.save(save_path=os.path.join(BASE_PATH, 'logs_models', exp_name,
                                          exp_name + '_finished'))

    except Exception as e:
        print("Exception occured during training", e)
        model_name = os.path.join(save_dir, "PPO2_error")
        model.save(model_name)
        import traceback
        traceback.print_exc()

    print("model saved")

    return
Code example #24
def train(dest_path="ppo2model", logdir='./logs/', pretrainedmodel=None, nsteps = 1, total_timesteps=10000, n_envs=1, verbose=1, nminibatches=4):
	model = None
	make_env_def()
	env = make_vec_env('AI4U-v0', n_envs=n_envs)
	if pretrainedmodel is not None:
		model = PPO2.load(pretrainedmodel, policy=CustomPolicy, tensorboard_log=logdir, nminibatches=nminibatches)
		model.set_env(env)
	else:
		model = PPO2(CustomPolicy,env, verbose=verbose, nminibatches=nminibatches, n_steps=nsteps, tensorboard_log=logdir)
	model.learn(total_timesteps=total_timesteps, reset_num_timesteps=False, tb_log_name=logdir)
	model.save(dest_path)
Code example #25
def run_openai_baselines(
    reward_config_file,
    hysr_one_ball_config_file,
    ppo_config_file,
    log_episodes=False,
    log_tensorboard=False,
    model_file_path=None,
):
    import tensorflow as tf
    from stable_baselines.common import make_vec_env

    env_config = {
        "reward_config_file": reward_config_file,
        "hysr_one_ball_config_file": hysr_one_ball_config_file,
        "log_episodes": log_episodes,
        "log_tensorboard": log_tensorboard,
    }
    env = make_vec_env(HysrOneBallEnv, env_kwargs=env_config)

    ppo_config = OpenAIPPOConfig.from_json(ppo_config_file)
    total_timesteps = ppo_config["num_timesteps"]
    del ppo_config["num_timesteps"]
    save_path = ppo_config["save_path"]
    del ppo_config["save_path"]

    if ppo_config["activation"] == "tf.tanh":
        ppo_config["activation"] = tf.tanh

    alg = "ppo2"
    learn = get_alg_module_openai_baselines(alg).learn

    # seed = 123
    if model_file_path is None:
        print("total timesteps:", total_timesteps)
        model = learn(
            env=env,
            # seed=seed,
            total_timesteps=total_timesteps,
            **ppo_config)
        model.save("ppo2_openai_baselines_hysr_one_ball")

    else:
        ppo_config["load_path"] = model_file_path
        model = learn(
            env=env,
            # seed=seed,
            total_timesteps=0,
            **ppo_config)

    if save_path:
        model.save(save_path)
        print("model saved to", save_path)

    return model, env
Code example #26
File: jupong2d_ppo2.py Project: RCX112/Pong-DeepRL
    def process_environment(self, ind, paddle_length=None, paddle_speed=None, ball_speed=None):
        """
        This method will be executed by multiple threads. It measures the quality of a neuronal network by analyzing
        different parameter values of the Gym-Environment JuPong2D. The results are mean-return-values, which will be
        saved in a csv-file.
        :param ind: Thread index for a scale factor
        :param paddle_length: Factor for the paddle length
        :param paddle_speed: Factor for the paddle speed
        :param ball_speed: Factor for the ball speed
        """
        env = make_vec_env(self.env_name, n_envs=4)

        if paddle_length is not None:
            for gym_env in env.envs:
                gym_env.scale_paddle_height(paddle_length)
            print(f"Paddle Length {paddle_length}")
        elif paddle_speed is not None:
            for gym_env in env.envs:
                gym_env.scale_paddle_vel(paddle_speed)
            print(f"Paddle Speed {paddle_speed}")
        elif ball_speed is not None:
            for gym_env in env.envs:
                gym_env.scale_ball_velocity(ball_speed)
            print(f"Ball Speed {ball_speed}")
        else:
            print("Kein Parameter gesetzt.")
            return

        obs = env.reset()
        return_vals = np.array([0.0, 0.0, 0.0, 0.0])
        return_val_arr = []
        done_cnt = 0
        
        for _ in range(self.play_steps):
            while True:
                action, _states = self.model.predict(obs)
                obs, rewards, dones, info = env.step(action)
                return_vals += rewards

                for i, done in enumerate(dones):
                    if done:
                        done_cnt += 1
                        return_val_arr.append(return_vals[i])
                        return_vals[i] = 0.0
                        self.return_arr[ind] = np.mean(return_val_arr)

                        print(self.return_arr)
                        with open(self.save_file, 'w') as my_file:
                            writer = csv.writer(my_file)
                            writer.writerow(self.scale_factor_arr)
                            writer.writerow(self.return_arr)
                            
                if done_cnt >= 4:
                    break
Code example #27
def forward_search(trained_timesteps, env_name, n_envs, save_file, seed, pid):
    env = make_vec_env(env_name, n_envs=n_envs)

    model = PPO.load(save_file, env=env, pid=pid)

    model = model.learn(trained_timesteps, tb_log_name="PPO", reset_num_timesteps=True)
    mean, std = test(model, env_name)
    pid = os.getpid()
    model.save(save_file, pid=pid)
    env.close()

    return [model.get_parameters(), pid, mean, std]
Code example #28
def launchAgent(model_name: str):
    """
    :param model_name: 실행시킬 모델의 종류. HER, DDPG, PPO2 혹은 기타값(DQN)이어야 함
                        현재는 의도상 PPO2로 세팅할 것
    :return: 1000회의 사이클을 돌고 난 이후의 모델
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    if model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    if model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            if model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            if model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # minimum value at which the FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        # keep the model reference so the final model can be returned after the loop
        # print('model save end')

    return model
Code example #29
File: ppo2.py Project: xclmj/deepwell
    def retrain(self, env, timesteps, modelpath, tensorboard_logs_path):
        #Periodically evalute agent, save best model
        eval_callback = EvalCallback2(env, best_model_save_path='app/model_logs/', 
                        log_path='app/model_logs/', eval_freq=1000,
                        deterministic=True, render=False) 

        model = self.load(modelpath, tensorboard_logs_path)
        env_str = self.get_env_str(env)
        model.set_env(make_vec_env(env_str, n_envs=8))
        model.learn(total_timesteps=timesteps, callback=eval_callback, reset_num_timesteps=False, tb_log_name="TB_"+datetime.now().strftime('%d%m%y-%H%M'))      #Continue training
        model.save(modelpath)
        return model
Code example #30
def test():
	make_env_def()
	# multiprocess environment
	env = make_vec_env('AI4U-v0', n_envs=8)
	
	model = PPO2.load("ppo2_model_baked", policy=CustomPolicy, tensorboard_log="./logs/")
	model.set_env(env)
	
	# Enjoy trained agent
	obs = env.reset()
	while True:
		action, _states = model.predict(obs)
		obs, rewards, dones, info = env.step(action)