Example #1
def train_agent_with_a2c(load=False):
    import gym
    import tensorflow as tf
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines.gail import ExpertDataset
    from stable_baselines import A2C

    # multiprocess environment
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT+"/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(i)

    return model
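Example #1 pretrains the A2C policy from an expert dataset stored in ./lqr_export.npz.
A minimal sketch of how such a file could be recorded with generate_expert_traj from
stable_baselines.gail; the expert controller below is a placeholder (the real file was
presumably produced by an LQR controller for F16GCAS-v0):

import gym
from stable_baselines.gail import generate_expert_traj

env = gym.make('F16GCAS-v0')

def expert_policy(obs):
    # Placeholder: substitute the LQR controller that produced lqr_export.npz
    return env.action_space.sample()

# Writes observations/actions/rewards/episode returns to lqr_export.npz
generate_expert_traj(expert_policy, 'lqr_export', env=env, n_episodes=10)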
Example #2
def train(game, num_timesteps, num_envs, dir_name, model_name,
          prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)

    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = A2C.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = A2C(policy="MlpPolicy",
                    env=env,
                    gamma=0.8,
                    n_steps=64,
                    learning_rate=0.00025,
                    verbose=1,
                    tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
Example #3
def train_a2c(seed):
    """
    test A2C on the uav_env(cartesian,discrete) 
    :param seed: (int) random seed for A2C
    """
    """
    A2C(policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, 
    max_grad_norm=0.5, learning_rate=0.0007, alpha=0.99, epsilon=1e-05,
    lr_schedule='linear', verbose=0,tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'A2C'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = A2C(policy=MlpPolicy, env=env, gamma=0.99, n_steps=5, vf_coef=0.25,
                ent_coef=0.01, max_grad_norm=0.5, learning_rate=0.0007, alpha=0.99,
                epsilon=1e-05, lr_schedule='linear', verbose=0,
                tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = A2C.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
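The callback passed to model.learn() above, together with the best_mean_reward and
n_steps globals, is not shown in this example. A minimal sketch in the usual
stable-baselines v2 callback style, assuming the environment was wrapped in a Monitor
writing to a log directory such as "/tmp/gym/" (both names are assumptions here):

import os
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

best_mean_reward, n_steps = -np.inf, 0
log_dir = "/tmp/gym/"  # assumed Monitor log directory

def callback(_locals, _globals):
    """Called at every training step; saves the model when the mean reward improves."""
    global best_mean_reward, n_steps
    if (n_steps + 1) % 1000 == 0:
        # Read the monitor files and compute the mean reward over the last 100 episodes
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
    n_steps += 1
    return True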
Example #4
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:  # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    envs = VecNormalize(
        envs)  # normalize the envs during training and evaluation

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy,
                         envs,
                         nminibatches=1,
                         verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
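A possible usage sketch for _train() above, evaluating the trained model on a single
normalized environment with evaluate_policy; the environment id, hyperparameters and
number of evaluation episodes are illustrative only, and make_env comes from the
surrounding script:

from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import SubprocVecEnv, VecNormalize

# Train (illustrative env id and hyperparameters)
train_envs, model = _train('Walker2d-v2', 'a2c', {'gamma': 0.99}, total_steps=100000)

# evaluate_policy() must only take one environment (see the comment above)
eval_env = VecNormalize(SubprocVecEnv([make_env('Walker2d-v2')]), training=False)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print("mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))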
Example #5
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        raise FileNotFoundError('log_dir does not exist: {}'.format(log_dir))

    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        model = None
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    # print(env.observation_space)
    # cv2.imshow('test', RGB2BGR(obs[0]))
    # cv2.waitKey(0)
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        # print('attention', np.array(attention).shape)
        for i, attention in enumerate(attentions):
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, [10] * attention.shape[0], axis=0)
            attention = np.repeat(attention, [10] * attention.shape[1], axis=1)
            attention = attention * 255
            attentions_img.append(attention)
            # print(np.sum(attention))
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        # break
        env.render()
    return model
Example #6
def train_A2C(start_time_tests    = [31*24*3600, 304*24*3600], 
              episode_length_test = 14*24*3600, 
              load                = False):
    '''Method to train (or load a pre-trained) A2C agent. Testing periods 
    have to be introduced already here to not use these during training. 
    
    Parameters
    ----------
    start_time_tests : list of integers
        Time in seconds from the beginning of the year that will be used 
        for testing. These periods should be excluded in the training 
        process. By default the first day of February and the first day of
        November are used. 
    episode_length_test : integer
        Number of seconds indicating the length of the testing periods. By
        default two weeks are reserved for testing. 
    load : boolean
        Boolean indicating whether the algorithm is loaded (True) or 
        needs to be trained (False)
     
    '''
    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,start_time_test+episode_length_test))
    # Summer period (from June 21st till September 22nd). 
    # Excluded since no heating during this period (nothing to learn).
    excluding_periods.append((173*24*3600, 266*24*3600))  
    
    env = BoptestGymEnvRewardWeightCost(url                   = url,
                                        actions               = ['oveHeaPumY_u'],
                                        observations          = {'reaTZon_y':(280.,310.)}, 
                                        random_start_time     = True,
                                        excluding_periods     = excluding_periods,
                                        max_episode_length    = 1*24*3600,
                                        warmup_period         = 3*3600,
                                        Ts                    = 900)
    
    env = NormalizedObservationWrapper(env)
    env = NormalizedActionWrapper(env)  
    
    model = A2C('MlpPolicy', env, verbose=1, gamma=0.99, seed=seed,
                tensorboard_log=os.path.join('results'))
    
    if not load: 
        model.learn(total_timesteps=int(1e5))
        # Save the agent
        model.save(os.path.join(utilities.get_root_path(), 'examples',
                                'agents', 'a2c_bestest_hydronic_heatpump'))
    else:
        # Load the trained agent
        model = A2C.load(os.path.join(utilities.get_root_path(), 'examples',
                                      'agents', 'a2c_bestest_hydronic_heatpump'))
    
    return env, model, start_time_tests
Example #7
    def build_model(self):
        if self.is_stack:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = A2C(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
            if self.game_type == "atari":
                self.model = A2C(CnnPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
        else:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = A2C(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
            if self.game_type == "atari":
                self.model = A2C(CnnLstmPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
Example #8
def run_baseline(params, LOAD_POLICY, VARIABLE_EVAL):
    # Evaluate the agent
    env = env_fun(animate=params["animate"],
                  max_steps=params["max_steps"],
                  action_input=False,
                  latent_input=False,
                  is_variable=VARIABLE_EVAL)
    policy = A2C('MlpPolicy', env)
    if LOAD_POLICY:
        policy_dir = "agents/xxx.zip"
        policy = A2C.load(policy_dir)  # 2Q5
    regressor = PyTorchMlpCst(env.obs_dim + env.act_dim, 24, env.obs_dim)
    return evaluate_model(params, env, policy, regressor)
Example #9
def get_model(model_name, env, log_dir):
    if model_name == "A2C_DualAttention":
        model = A2C(DualAttentionLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention_Cin":
        model = A2C(SelfAttentionCinLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention":
        model = A2C(SelfAttentionLstmPolicy, env, verbose=1)
    elif model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention3':
        model = A2C(Attention3Policy, env, verbose=1)
    elif model_name == 'A2C_Attention4':
        model = A2C(Attention4Policy, env, verbose=1)
    elif model_name == 'A2C':
        model = A2C(CnnLstmPolicy, env, verbose=1)
    else:
        raise ValueError('{} does not exist'.format(model_name))
    return model
Example #10
def load_a2c_model(env, learning_rate, batch_size, algorithm):
    from stable_baselines.common.policies import MlpPolicy
    model = None
    existing_pickle_files = get_files_with_pattern(pickle_dir, 'ppo2_recent_model.pkl')
    
    for file_name in existing_pickle_files:
        search = re.search('ppo2_recent_model.pkl', file_name)
        if search:
            model = A2C.load(file_name, env=env, verbose=0, tensorboard_log=log_dir)
            logger.info("Loading existing pickle file for environment {} with algorithm {} and policy '{}'.".format(env, algorithm, model.policy))
            return model
    
    logger.debug("No pickle was found for environment {}. Creating new model with algorithm {} and policy 'MlpPolicy'...".format(env, algorithm))
    model = A2C(policy='MlpPolicy', env=env, verbose=0, tensorboard_log=log_dir, learning_rate=learning_rate, n_steps = batch_size)
    return model  
Example #11
def train():
    """Trains an A2C policy """
    env = create_env()

    model = A2C(
        policy = CnnPolicy,
        env = env,
        gamma = 0.99,
        n_steps = 5,
        vf_coef=0.25, 
        ent_coef=0.01,
        max_grad_norm=0.5,
        learning_rate=7e-4,
        alpha=0.99,
        epsilon=1e-05,
        lr_schedule='constant',
        verbose=1,
        tensorboard_log="./tb"  
    )

    model.learn(
        total_timesteps=int(1e7), 
        callback=callback, 
        tb_log_name="a2c"
    )

    model.save("models/pacman_a2c.pkl")
Example #12
def load_model(tickers):
    '''Load in the pretrained model from the trained models folder '''
    # model = run_model(tickers,start="2020-01-01T09:30:00-04:00", end="2020-12-31T09:30:00-04:00")
    model = A2C.load(
        "trained_models/2021-03-22 18:25:09.528982/A2C_30k_dow_120.zip")

    return model
Example #13
def run_agent(envs, parameters):
    '''Train an agent.'''
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy,
                     dummy_env,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     verbose=1,
                     nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy,
                    dummy_env,
                    gamma=gamma,
                    learning_rate=learning_rate,
                    verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy,
                     dummy_env,
                     gamma=gamma,
                     verbose=1,
                     actor_lr=learning_rate / 10,
                     critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible Nan, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
        model.save(str(model_path))
Example #14
def define_model(env, log_dir):
    if DEFAULT:
        policy_kwargs = dict()
    else:
        policy_kwargs = dict(act_fun=ACT_FUN, net_arch=NET_ARCH)

    if ALGORITHM == 'ppo2':
        model = PPO2(policy=MlpPolicy,
                     env=env,
                     policy_kwargs=policy_kwargs,
                     verbose=0,
                     tensorboard_log=log_dir)

    elif ALGORITHM == 'a2c':
        model = A2C(policy=MlpPolicy,
                    env=env,
                    policy_kwargs=policy_kwargs,
                    verbose=0,
                    tensorboard_log=log_dir)
    else:
        raise Exception('Specify proper algorithm')

    model_arch = model.get_parameter_list()
    print('\n--------------- Summary of archs ---------------')
    for model_param in model_arch:
        print(model_param)
    print('\n')

    return model
Example #15
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
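A possible invocation of the helper above, with purely illustrative arguments (any
Atari environment id registered in gym would work):

train('BreakoutNoFrameskip-v4', num_timesteps=40000, seed=0,
      policy='cnn', lr_schedule='constant', num_env=4)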
Example #16
def main():
    alg_input = input("Select algorithm (PPO2 or A2C only):")
    if alg_input != "PPO2" and alg_input != "A2C" and alg_input != "ppo2" and alg_input != "a2c":
        print("Not an option (PPO2 or A2C only) !")
        alg_input = input("Select algorithm (PPO2 or A2C only):")
    model_input = "trained_agents\\" + input(
        "Select model to test(input filename, eg. a2c_wf_2):")

    env = gym.make("WARFLEET-v0")
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    log_dir = "./logs/"

    done = False
    stage_reward = 0
    turns = 0

    if alg_input == "PPO2" or alg_input == "ppo2":
        model = PPO2.load(model_input, env=env, tensorboard_log=log_dir)
    elif alg_input == "A2C" or alg_input == "a2c":
        model = A2C.load(model_input, env=env, tensorboard_log=log_dir)

    obs = env.reset()

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        stage_reward += reward
        turns = turns + 1
        # env.render()

    print("Reward: {} /42".format(stage_reward))
    print("Turns: {}".format(turns))
    env.close()
Example #17
def test_a2c_update_n_batch_on_load(tmp_path):
    env = make_vec_env("CartPole-v1", n_envs=2)
    model = A2C("MlpPolicy", env, n_steps=10)

    model.learn(total_timesteps=100)
    model.save(os.path.join(str(tmp_path), "a2c_cartpole.zip"))

    del model

    model = A2C.load(os.path.join(str(tmp_path), "a2c_cartpole.zip"))
    test_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

    model.set_env(test_env)
    assert model.n_batch == 10
    model.learn(100)
    os.remove(os.path.join(str(tmp_path), "a2c_cartpole.zip"))
Example #18
def run_agent(envs, parameters):
    '''Train an agent.'''
    path = Path(parameters['path'])
    dummy_env = OptVecEnv(envs)
    set_global_seeds(parameters.setdefault('seed'))
    save_path = str(path / 'model.pkl')
    alg = parameters['alg']
    if alg == 'PPO':
        with open(save_path, 'rb') as pkl:
            model = PPO2.load(pkl, env=dummy_env)
    elif alg == 'A2C':
        with open(save_path, 'rb') as pkl:
            model = A2C.load(pkl, env=dummy_env)
    try:
        done = False
        observations = dummy_env.reset()
        while not done:
            action = model.predict(observations)
            print(action[0].ravel().tolist())
            observations, rewards, dones, infos = dummy_env.step(action[0])
            done = any(dones)
            info = infos[0]
            yield info['weights']
    finally:
        dummy_env.close()
Example #19
def evaluate(modelname, env):
    n_cores = 4
    obs = env.reset()
    model = A2C.load(modelname)
    wr = 0
    win = 0
    total_health_diff = 0
    loss = 0
    episodes = 0
    total_episodes = 100
    while episodes < total_episodes:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        #print(rewards[0])
        time.sleep(.04)
        # print(rewards[0])
        env.render(mode="human")
        for i in range(4):

            if dones[i]:
                if info[i]["p1_health"] < info[i]["p2_health"]:
                    loss += 1
                else:
                    win += 1
                total_health_diff += info[i]["p1_health"] - info[i]["p2_health"]
                wr = win / (win + loss)
                episodes += 1
    return wr, total_health_diff / total_episodes
Example #20
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish     :")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Example #21
def get_a2c(vec_env=None,
            policy='CnnPolicy',
            learning_rate=7e-4,
            momentum=0.0,
            alpha=0.99,
            epsilon=1e-5,
            max_grad_norm=0.5,
            lr_schedule='constant') -> A2C:
    """
    Parameter's default values are taken from stable_baselines.a2c.a2c.py
    """
    if vec_env is None:
        vec_env = create_training_env(1)
    return A2C(policy=policy,
               env=vec_env,
               gamma=0.99,
               n_steps=5,
               vf_coef=0.25,
               ent_coef=0.01,
               max_grad_norm=max_grad_norm,
               learning_rate=learning_rate,
               alpha=alpha,
               momentum=momentum,
               epsilon=epsilon,
               lr_schedule=lr_schedule,
               verbose=2)
Example #22
def test_evaluate_policy():
    model = A2C('MlpPolicy', 'Pendulum-v0', seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_['model'].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(model,
                                         model.get_env(),
                                         n_eval_episodes,
                                         deterministic=True,
                                         render=False,
                                         callback=dummy_callback,
                                         reward_threshold=None,
                                         return_episode_rewards=True)

    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model,
                        model.get_env(),
                        n_eval_episodes,
                        reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model,
                                         model.get_env(),
                                         n_eval_episodes,
                                         return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
Example #23
def train_agent(train, pickle_file, agent_type, env_kwargs, parms):

    bin_path = "bin/" + pickle_file

    if (path.exists(bin_path)):
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
    else:
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()

        agent = ipagent.IPRLAgent(env=env_train)

        model = agent.get_model(model_name=agent_type, model_kwargs=parms)

        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)

        RL_model.save(bin_path)

    return RL_model
Example #24
    def __init__(self, method, K=5, P=0.95):
        self.method = method
        self.K = K
        self.state_size = self.K + 1
        self.action_size = self.K + 1
        self.reward = []

        env_name = 'ErdosAttack-v0'

        self.log_dir = "/tmp/gym_attack/"
        os.makedirs(self.log_dir, exist_ok=True)

        env = gym.make(env_name)
        env.init_params(K, P)
        env = Monitor(env, self.log_dir, allow_early_resets=True)
        self.envs = DummyVecEnv([lambda: env])

        if method == 'PPO':
            self.model = PPO2(MLP_PPO, self.envs, verbose=0)
        elif method == 'DQN':
            self.model = DQN(MLP_DQN, self.envs, verbose=0)
        elif method == 'A2C':
            self.model = A2C(MLP_A2C, self.envs, verbose=0)
        else:
            raise Exception("Erreur ! Méthode: 'PPO' ou 'DQN' ou 'A2C")
        print("Model Initialized !")

        self.best_mean_reward, self.n_steps = -np.inf, 0
Example #25
    def test_save_callback(self):
        '''
        Test that the model performance can be monitored and results can be 
        checked and saved as the model improves. This test trains an agent
        for a short period of time, without loading a pre-trained model. 
        Therefore, this test also checks that an RL agent from stable-baselines
        can be trained.
        
        '''
        # Define logging directory. Monitoring data and agent model will be stored here
        log_dir = os.path.join(utilities.get_root_path(), 'examples', 'agents',
                               'monitored_A2C')

        # Perform a short training example with callback
        env, _, _ = run_save_callback.train_A2C_with_callback(
            log_dir=log_dir, tensorboard_log=None)

        # Load the trained agent
        model = A2C.load(os.path.join(log_dir, 'best_model'))

        # Test one step with the trained model
        obs = env.reset()
        df = pd.DataFrame([model.predict(obs)[0][0]], columns=['value'])
        df.index.name = 'keys'
        ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                    'references', 'save_callback.csv')
        self.compare_ref_values_df(df, ref_filepath)

        # Remove the model so it does not interfere with further tests
        shutil.rmtree(log_dir, ignore_errors=True)
Example #26
def load(config, agent, epoch, from_disk=True):
    config = config['ai']
    if not config['enabled']:
        logging.info("ai disabled")
        return False

    logging.info("[ai] bootstrapping dependencies ...")

    from stable_baselines import A2C
    from stable_baselines.common.policies import MlpLstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    import pwnagotchi.ai.gym as wrappers

    env = wrappers.Environment(agent, epoch)
    env = DummyVecEnv([lambda: env])

    logging.info("[ai] bootstrapping model ...")

    a2c = A2C(MlpLstmPolicy, env, **config['params'])

    if from_disk and os.path.exists(config['path']):
        logging.info("[ai] loading %s ..." % config['path'])
        a2c = A2C.load(config['path'], env)  # A2C.load is a classmethod that returns a new model
    else:
        logging.info("[ai] model created:")
        for key, value in config['params'].items():
            logging.info("      %s: %s" % (key, value))

    return a2c
Example #27
def load_a2c():
    loaded_model = A2C.load(save_dir + "/A2C_tutorial")
    print("loaded", loaded_model.predict(obs, deterministic=True))
    print("load gamma=", loaded_model.gamma, ", n_steps=", loaded_model.n_steps)
    # Saving a model stores its hyperparameters and network weights, but not the environment env; after loading, the environment must be set again.
    loaded_model.set_env(DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
    loaded_model.learn(8000)
Example #28
def test_monitor():
    env = gym.make('Pendulum-v0')
    env = Monitor(gym.make('Pendulum-v0'), filename=None, allow_early_resets=True)
    normalized_env = NormalizeActionWrapper(env)
    normalized_env = DummyVecEnv([lambda: normalized_env])
    # model
    model_2 = A2C('MlpPolicy', normalized_env, verbose=1).learn(1000)
Example #29
def NewPotential(current_window, algorithm='PPO'):

    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("%s is does not match the model's window size." %
                         len(current_window))

    action, _states = model.predict(current_window, deterministic=False)

    voltages = np.linspace(0, 1, num=model.action_space.n)
    if action >= 0 and action <= model.action_space.n - 1:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space".
            format(action))

    return voltage
Example #30
def a2c(env, seed):
    return A2C('MlpPolicy',
               env,
               learning_rate=0.001,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
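A short usage sketch for the factory above; the CartPole environment and the timestep
budget are illustrative only:

import gym
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = a2c(env, seed=0)
model.learn(total_timesteps=10000)
model.save('a2c_cartpole')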