Code Example #1
def _make_agents(env, model_dir, agent_name, op_name=None):
    # Models
    if agent_name is None:
        model = SimpleRuleBasedAgent(env)
    else:
        model = DQN.load(model_dir + agent_name)

    if op_name is None:
        op = SimpleRuleBasedAgent(env)
    else:
        op = DQN.load(model_dir + op_name)
    return model, op
Code Example #2
def basic_usage_example():
    # Basic Usage: Training, Saving, Loading.

    # Create environment.
    env = gym.make("LunarLander-v2")

    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass 'print_system_info=True'
    # to compare the system on which the model was trained vs the current one.
    #model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: If you use wrappers with your environment that modify rewards,
    # this will be reflected here. To evaluate with original rewards,
    # wrap the environment in a "Monitor" wrapper before other wrappers.
    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    # Enjoy trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
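
Note on the evaluation above: to report the original (unwrapped) episode rewards, the environment can be wrapped in a Monitor before any reward-modifying wrappers are applied. A minimal sketch, assuming the same LunarLander setup as in this example:

from stable_baselines3.common.monitor import Monitor

# Monitor records the raw episode rewards, so evaluate_policy reports them
# even if wrappers applied afterwards modify the reward signal.
eval_env = Monitor(gym.make("LunarLander-v2"))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)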
Code Example #3
def run_experiment(env, experiment_id, episodes=EPISODES, visualise=False):
    evaluation_results, histories = [], []
    models = [
        NoneAgent(env),
        RandomAgent(env),
        HeuristicAgent(env),
        DQN.load(MODEL_PATH)
    ]
    model_names = ['baseline_1', 'baseline_2', 'baseline_3', 'dqn_model']

    env.experiment = experiment_id

    for model, model_id in zip(models, model_names):
        print(f'EXPERIMENT: {experiment_id}, MODEL: {model_id}')

        evaluation_result, history = run_evaluation(env, model, episodes,
                                                    visualise)

        evaluation_result['model'] = model_id
        evaluation_result['experiment'] = experiment_id
        evaluation_results.append(evaluation_result)

        history['model'] = model_id
        history['experiment'] = experiment_id
        histories.append(history)

    return pd.concat(evaluation_results), pd.concat(histories)
Code Example #4
def main(save_video=False,
         num_eps=1,
         render=False,
         attack=None,
         save_perturbed_img=False):
    pong_duel.AGENT_COLORS[1] = 'red'
    # Initialize environment
    env = gym.make('PongDuel-v0')
    env = RewardZeroToNegativeBiAgentWrapper(env)
    if save_video:
        env = Monitor(env,
                      './output/recordings',
                      video_callable=lambda episode_id: True,
                      force=True)
    # env = RewardZeroToNegativeBiAgentWrapper(env)
    # env = ObservationVectorToImage(env, 'both')
    env = ObserveOpponent(env, 'both')
    env = MAGymCompatibilityWrapper(env, image_observations='none')
    model_dir = '../../output/gcp-models/'

    # Models
    op_name = 'gcp-feature-based-op-obs7.out'

    model = WhiteBoxMonteCarloAgent(env, num_sims=10, sim_max_steps=2000)
    op = DQN.load(model_dir + op_name)
    env.set_opponent(op)
    avg_reward, total_steps = evaluate(model,
                                       env,
                                       attack=attack,
                                       slowness=0.05,
                                       num_eps=num_eps,
                                       render=render,
                                       save_perturbed_img=save_perturbed_img)
    print(avg_reward)
    print(total_steps)
Code Example #5
def main():
    policy = DQN.load('experiments/multirun/grid_sweep/2021-03-07/14-13-11/1/ckpts/rl_model_5000000_steps.zip')
    # load the environment using the yaml that was used for training:
    cfg = OmegaConf.load('experiments/multirun/grid_sweep/2021-03-07/14-13-11/1/.hydra/config.yaml')
    env_cfg = cfg.env
    env_cfg.render = True
    env = Expando(**env_cfg)
    obs_0 = env.reset()
    for i in range(10000):
        action_0 = policy.predict(obs_0)[0][0]
        obs_0, reward, done, info = env.step(action_0)
        env.render()
Code Example #6
def run_sensitivity():
    model = DQN.load(MODEL_PATH)
    results = []

    for policy in [0.1, 0.25, 0.5, 1, 2]:
        config = {
            'simulation_frequency': 15,
            'demand_amplitude': 15000,
            'total_steps': 100,
            'policy_frequency': policy
        }
        env = gym.make('highway-v0', **config)
        result, history = run_episode(env, model, False)
        results.append(result)

    results_df = pd.DataFrame(results)
    results_df.to_csv(SENSITIVITY_FILENAME)
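
Note: depending on the highway-env version, the configuration dictionary may need to be applied through env.configure (as done in Code Example #30) rather than passed as keyword arguments to gym.make. A minimal sketch of that variant, reusing the config dict from the loop above:

env = gym.make('highway-v0')
env.configure(config)
env.reset()
result, history = run_episode(env, model, False)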
Code Example #7
def test_dqn():
    log_dir = f"model_save/best_model_dqn"
    env = ENV_DISCRETE(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = DQN.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)
            day += 1
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=',
                      env.buy_hold)
                break
Code Example #8
def evaluate(params):

    # Load saved model
    model = DQN.load(exp_name, env=env)
    results = np.zeros(shape=(0, 0))
    obs = env.reset()

    # Evaluate the agent
    episode_reward = 0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        # Record the result before resetting so the accumulated reward is kept,
        # and use a consistent default for the success flag.
        result = ("Reward:", episode_reward, "Success?",
                  info.get('is_success', False))
        results = np.append(results, result, axis=None)

        if done or info.get('is_success', False):
            episode_reward = 0.0
            obs = env.reset()
Code Example #9
def main(save_video=False, num_eps=1, render=True, attack=None, save_perturbed_img=False):
    # Initialize environment
    env = gym.make('PongDuel-v0')
    if save_video:
        env = Monitor(env, './output/recordings', video_callable=lambda episode_id: True, force=True)
    # env = RewardZeroToNegativeBiAgentWrapper(env)
    env = ObserveOpponent(env, 'both')
    env = MAGymCompatibilityWrapper(env, image_observations='none')
    model_dir = '../../output/gcp-models/'

    # Models
    agent_name = "gcp-feature-based-op-obs7.out"

    victim = DQN.load(model_dir + agent_name)

    adv = WhiteBoxAdversarialAgent(env, victim, victim_type='sb3')

    env.set_opponent(victim)
    avg_reward, _ = evaluate(adv, env, attack=attack, slowness=0.05, num_eps=num_eps, render=render,
                             save_perturbed_img=save_perturbed_img, )
    print(avg_reward)
Code Example #10
                screen.fill((0,0,0))
                screen.blit(pygame.surfarray.make_surface(resize(reconstruct[0].transpose(1, 0, 2), 600)), (0, 0))
                pygame.display.flip()
    
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        pygame.quit()
    
    
            # real speed
            state[32] += args.safety

            # abs 1-4
            state[34:38] += args.safety

            # action = np.argmax(model.forward(state))
            action, _states = model.predict(state, deterministic=True)
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    env = make_env()()

    if args.load_from is not None:
        print("loading model", args.load_from)
        model = DQN.load(args.load_from)
    else:
        # Without a checkpoint there is no model to evaluate.
        raise ValueError("load_from must point to a saved DQN checkpoint")

    evaluate(model, env)
Code Example #11
# by frank tian, 2021-1-14

from stable_baselines3 import DQN
import gym_flappy_bird
import gym
import os

env = gym.make("FlappyBird-v0", is_demo=True)
obs = env.reset()

model = DQN.load(os.path.join(os.path.dirname(__file__),
                              'logs/best_model.zip'))

if __name__ == "__main__":
    rewards = 0
    time_steps = 0
    while True:
        # action = env.action_space.sample()
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        rewards += reward
        time_steps += 1
        env.render()

        if done:
            obs = env.reset()
            print("rewards: {}, of {} steps".format(rewards, time_steps))
            rewards = 0
            time_steps = 0
Code Example #12
File: ttt_action.py Project: tpvt99/sbcs5478
env = VecFrameStack(
    env,
    n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"),
                            env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))

else:
    raise ValueError("Error model")

# Load the saved statistics
#  do not update them at test time
env.training = False
# reward normalization is not needed at test time
env.norm_reward = False

obs = env.reset()
steps = 0
rewards = 0
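
For context, the vec_normalization.pkl file loaded above is normally produced at training time by saving the VecNormalize wrapper. A minimal sketch of that step, with an illustrative environment id and the same results_dir layout assumed:

import gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Wrap the training env, train the model on it, then persist the running statistics.
train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
# ... model.learn(...) on train_env ...
train_env.save(osp.join(results_dir, "vec_normalization.pkl"))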
Code Example #13
    def __init__(self):
        self.env = DQNAgent.create_env(1)
        self.model = DQN.load(MODEL_PATH)
Code Example #14
File: dqn_sudoku.py Project: vrundha/sudoku
import sys
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN
from gym_sudoku.envs.sudoku_env import SudokuEnv

env = SudokuEnv()

if "--train" in sys.argv:
    model = DQN(MlpPolicy, env, verbose=1, learning_starts=100)
    model.learn(total_timesteps=10000)
    model.save("dqn_sudoku")
else:
    model = DQN.load("dqn_sudoku")

obs = env.reset()
env.render()
for _ in range(20):
    action, _states = model.predict(obs, deterministic=True)
    print("Action", action)
    print("States", _states)
    print("Coordinates", env.fill_pointer)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        print("Resetting ==============================================>")
        obs = env.reset()

Code Example #15
File: sb_agent.py Project: MoMe36/TradingAgent
            verbose = 1, device = torch.device('cpu'), 
            tensorboard_log = './runs/')
        else: 
            model = PPO('MlpPolicy', make_vec_env('Trading-v2', 8), 
                verbose = 1, device = torch.device('cpu'), 
                tensorboard_log = './runs/')
        
        model.learn(total_timesteps = 20e6, 
                    tb_log_name = args.name, 
                    callback = CheckpointCallback(save_freq = 10000, save_path = "./trained_models", 
                                                  name_prefix = args.name))
        model.save('{}_trading_sb'.format('dqn' if args.dqn else 'ppo'))
    else: 
        print('Loading agent')
        if(args.dqn):
            model = DQN.load('dqn_trading_sb') 
        else: 
            model = PPO.load('ppo_trading_sb')
    # model = PPO('MlpPolicy', env, verbose = 1)


    eval_eps = 100
    pbar = tqdm(total = eval_eps)
    env = gym.make('Trading-v0')
    rewards = []
    baseline_diff = []
    for ep in range(eval_eps): 
        done = False 
        ep_reward = 0
        s = env.reset()
        while not done: 
Code Example #16
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim),
                                    nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))


policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=128),
)

if os.path.isfile(agentPath):
    print(f"Load agent from {agentPath}")
    # model = PPO.load(agentPath)
    model = DQN.load(agentPath)
    model.set_env(env)
else:
    print(f"Instanciate new agent and save in {agentPath}")
    # model = PPO("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    # model = DQN("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    model = DQN("CnnPolicy",
                env,
                target_update_interval=1000,
                batch_size=512,
                exploration_final_eps=0.2,
                policy_kwargs=policy_kwargs,
                verbose=1)
    model.save(agentPath)

# Record gif of trained agent
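
The definition of CustomCNN is truncated above (only the tail of __init__ and the forward method are shown). In Stable-Baselines3 a custom features extractor of this kind typically subclasses BaseFeaturesExtractor; the following is a sketch of what the missing part usually looks like, not the project's exact code:

import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim: int = 128):
        super().__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        # Small example CNN; the real project may use a different architecture.
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened size with one dummy forward pass.
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim),
                                    nn.ReLU())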
Code Example #17
        model.learn(total_timesteps=training_timesteps,
                    log_interval=1,
                    callback=[callback])

        npz = np.load(train_log_dir + '/log.npz')
        df = pd.DataFrame.from_dict({item: npz[item] for item in npz.files})
        print(
            'Train Profit Factor:',
            df['Index'].loc[(df['State'] != 'Flat')
                            & (df['Profit'] > 0)].count() /
            df['Index'].loc[(df['State'] != 'Flat')
                            & (df['Profit'] < 0)].count())
        df.to_csv(train_log_dir + '/Train_log.csv')

    if TEST:
        model = DDQN.load(log_dir + '/best_model/' + MODEL_NAME)
        env = gym.make('rl_stocks-v0')
        env._reset(actions=N_DISCRETE_ACTIONS,
                   observation_space=OBSERVATION_SPACE_TEST,
                   data=test_df,
                   trade_amount=TRADE_AMOUNT,
                   key=KEY,
                   wallet=WALLET,
                   window=WINDOW,
                   interest_rate=INTEREST_RATE,
                   log_dir=test_log_dir)

        for step in range(testing_timesteps):
            obs = env.reset()
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
Code Example #18
            model.save(model_name)

        if ALGORITHM == 'PPO':
            from stable_baselines3.ppo import MlpPolicy
            model = PPO(MlpPolicy, env, tensorboard_log=log_dir, verbose=2)
            model.learn(total_timesteps=NUM_EPISODES * MAX_STEPS,
                        tb_log_name=log_name,
                        log_interval=1)
            model.save(model_name)

        else:
            print('!ERROR: incorrect algorithm selection!')

        del model

    else:
        model_name = '02-08-2021_' + ALGORITHM + '_scarecrow'

        if ALGORITHM == 'DQN':
            model = DQN.load(model_name)
        if ALGORITHM == 'PPO':
            model = PPO.load(model_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Code Example #19
env = DummyVecEnv([
    lambda: Monitor(
        gym.make(
            "airgym:airsim-drone-sample-v0",
            ip_address="127.0.0.1",
            step_length=1,
            image_shape=(84, 84, 1),
            destination=np.array([300, 0, -40]),
        ))
])

# Wrap env as VecTransposeImage to allow SB to handle frame observations
env = VecTransposeImage(env)

model = DQN.load("model/dqn_airsim_drone_policy_4actions_30000_steps_cont.zip",
                 env=env)

# Initialize RL algorithm type and parameters
# model = DQN(
#     "CnnPolicy",
#     env,
#     learning_rate=0.00025,
#     verbose=1,
#     batch_size=32,
#     train_freq=4,
#     target_update_interval=200,
#     learning_starts=200,
#     buffer_size=10000,
#     max_grad_norm=10,
#     exploration_fraction=0.1,
#     exploration_final_eps=0.01,
Code Example #20
File: main_rl.py Project: agunter7/CPEN513_Project
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char

    if e_char == 'l':
        # RL Agent Learning pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1, learning_rate=LEARN_RATE, exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL, gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)

        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)
            rl_action_step(rand_action)
    else:
        pass
Code Example #21
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

env = gym.make('fishing-v0')

check_env(env)

model = DQN('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=200)

## Simulate a run with the trained model, visualize result
df = env.simulate(model)
env.plot(df, "dqn.png")

## Evaluate model
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print("mean reward:", mean_reward, "std:", std_reward)

## Save and reload the model
model.save("dqn")
model = DQN.load("dqn")
Code Example #22
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

# Instantiate the env
env = ABCEnv()
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

# Train the agent
"""
Things you might want to play around with: learning_rate, total timesteps, etc.
Always choose a sample-efficient algorithm.
"""
total_timesteps = 200
model = DQN('MlpPolicy',
            env,
            verbose=1,
            tensorboard_log="./CSC2547_tensorboard/")
model.learn(total_timesteps)

model_name = "DQN_timesteps_" + str(total_timesteps)
model.save(model_name)

# DQN.load() returns a new model instance, so assign the result.
model = DQN.load(model_name, env=env)
mean_reward, std_reward = evaluate_policy(model,
                                          model.get_env(),
                                          n_eval_episodes=2)
print("mean_reward is: ", mean_reward)
print("std_reward is: ", std_reward)
Code Example #23
    new = 1
    load = 0
    test = 0
    if new + load + test != 1:
        raise Exception('Initialize new, train or load a model')

    dm, y_oracle = init_dm(CONFIG)
    print(dm)
    env = ClassificationEnv(dm, y_oracle)

    sys.path.insert(0, 'dral')
    if new:
        model = DQN(CnnPolicy, env, verbose=1, learning_rate=2e-4,
                    gamma=0.98, batch_size=32, learning_starts=3000)
    if load:
        model = DQN.load("data/rl_query_rps.pth")
    if test:
        model = init_and_train_rl_classification_model(
            timesteps=100000, path='data/rl_query_dogs_cats.pth')

    # show_grid_imgs(dm.test.get_x(list(range(9))), dm.test.get_y(list(range(9))), (3, 3))
    n_episodes = 5
    for k in range(n_episodes):

        # label images
        y_oracle = label_samples(dm, y_oracle, n=100, random=True)
        dm.train.shuffle()
        print(dm)

        model.learn(total_timesteps=6000, log_interval=30)
Code Example #24
env = DummyVecEnv([
    lambda: Monitor(
        gym.make(
            "airgym:airsim-drone-sample-v0",
            ip_address="127.0.0.1",
            step_length=1,
            image_shape=(84, 84, 1),
            destination=np.array([70, -5, -20]),
        ))
])

# Wrap env as VecTransposeImage to allow SB to handle frame observations
env = VecTransposeImage(env)

# model = DQN.load("model/dqn_airsim_drone_policy")
model = DQN.load(
    "checkpoint/v18_dqn_cnnPolicy_4actions_imageObs_100000_steps/dqn_policy_65000_steps.zip"
)

mean_reward, std_reward = evaluate_policy(model,
                                          env,
                                          n_eval_episodes=1,
                                          deterministic=True)

print(f"mean_reward = [{mean_reward:.2f}] +/- {std_reward}")

# obs = env.reset()
# while True:
#     action, _states = model.predict(obs, deterministic=True)
#     obs, reward, done, info = env.step(action)
#     if done:
#       obs = env.reset()
Code Example #25
File: dqn_agent.py Project: lolotrgeek/task-learner
print('Environment Setup...')
env = gym.make('Desktop-v0', debug=False, show=True, steplimit=100)
outdir = '/tmp/random-agent-results'
env = Monitor(env, directory=outdir, force=True)
episodes = 10
# Setup Agent
print('Agent Setup...')
model = DQN(MlpPolicy, env, verbose=0, buffer_size=500)
print('Returning Trained Model...')
model.learn(total_timesteps=1000, log_interval=4)
print('Saving Trained Model...')
model.save("deepq_desktop")

del model  # remove to demonstrate saving and loading
print('Loading Trained Model...')
model = DQN.load("deepq_desktop")


def unique_reward(last_state, current_state):
    # rewards a current state that is different from the last state
    return (np.sum(last_state) - np.sum(current_state))


if __name__ == '__main__':
    try:
        print('Running Environment')
        last_state = None
        # Run Environment
        for episode in range(episodes):
            print('Episode:', episode)
            obs = env.reset()
Code Example #26
def learn_with_selfplay(max_agents,
                        num_learn_steps,
                        num_learn_steps_pre_training,
                        num_eval_eps,
                        num_skip_steps=0,
                        model_name='dqn',
                        only_rule_based_op=False,
                        patience=5,
                        image_observations=True,
                        output_folder="output",
                        fine_tune_on=None,
                        opponent_pred_obs=False,
                        adversarial_training=None,
                        save_freq=None):
    """
    Train an agent with regular self-play. If there are checkpoints of previous training continue training with the checkpoints.

    :param max_agents: Stop after max_agents intermediate agents have been trained. An intermediate agent is saved when training
    successfully created an improved agent.
    :param num_learn_steps: Number of frames / steps for every learning iteration
    :param num_learn_steps_pre_training: Number of frames / steps for pre-training on the rule-based agent
    :param num_eval_eps: Number of episodes for intermediate evaluation. Intermediate evaluation determines whether trained agent improved
    compared to previous version
    :param num_skip_steps: Skip num_skip_steps frames performing the action from the previous step
    :param model_name: Name for saving the model. If there are already checkpoints with this name training is continued. Checkpoints will be
    saved as madel_namei, where i is the training iteration.
    :param only_rule_based_op: If set to true training is only performed on the rule-based agent.
    :param patience: Patience parameter for evaluation
    :param image_observations: Use image instead of feature observations
    :param output_folder: Root folder for outputs
    :param fine_tune_on: If not None instead of self-play training perform training of an adversarial policy against the victim specified as
    string to this parameter
    :param opponent_pred_obs:
        If this is set to True, the predictions of the opponents in the current state will beconcatenated to the observations for the main
        agent. This was an attempt to create a stronger adversarial policy, which could use this information, however in our experiments
        this didn't improve the adversarial policy
    :param adversarial_training: If set to True perform adversarial training using FGSM during training.
    :param save_freq: If not None save intermediate checkpoints during training with the given frequency
    :return:
    """
    eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based = _init_envs(image_observations,
                                                                                         num_skip_steps,
                                                                                         opponent_pred_obs,
                                                                                         adversarial_training)

    # If fine tuning, load model to fine-tune from path
    if fine_tune_on is not None:
        path = Path(output_folder) / 'models' / fine_tune_on
        fine_tune_model = DQN.load(path)
        fine_tune_model.tensorboard_log = None
        if opponent_pred_obs:
            # We can't eval on agents that don't have a q_net so we change eval_op to the original model that is being
            # fine-tuned against, instead of the rule-based agent
            eval_op = fine_tune_model
            eval_env_rule_based.set_opponent(eval_op)
            eval_env_rule_based = OpponentPredictionObs(eval_env_rule_based)
            eval_env.set_opponent(eval_op)
            eval_env = OpponentPredictionObs(eval_env)
    else:
        fine_tune_model = None

    # Initialize first agent
    pre_train_agent = SimpleRuleBasedAgent(train_env_rule_based)
    previous_models = [pre_train_agent]

    # Load potentially saved previous models
    for opponent_id in range(1, max_agents):
        path = _make_model_path(output_folder, model_name, opponent_id)
        if os.path.isfile(path):
            model = DQN.load(path)
            previous_models.append(model)
        else:
            break

    # Initialize first round
    last_agent_id = len(previous_models) - 1
    prev_num_steps = 0
    patience_counter = 0
    tb_path = Path(output_folder) / "tb-log"
    if last_agent_id == 0:
        # main_model = A2C('MlpPolicy', policy_kwargs=dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5)), env=train_env, verbose=0,
        #                 tensorboard_log="output/tb-log")
        # main_model = A2C('MlpPolicy', train_env, verbose=0, tensorboard_log="output/tb-log")  # , exploration_fraction=0.3)
        main_model = DQN('MlpPolicy', train_env_rule_based, verbose=0, tensorboard_log=tb_path)  # , exploration_fraction=0.3)
    else:
        main_model = copy.deepcopy(previous_models[last_agent_id])
        main_model.set_env(train_env)
        main_model.tensorboard_log = tb_path

    # Start training with self-play over several rounds
    opponent_id = last_agent_id
    while opponent_id < max_agents - 1:
        print(f"Running training round {opponent_id + 1}")
        if fine_tune_on is None:
            # Choose opponent based on setting
            if only_rule_based_op:
                current_train_env = train_env_rule_based
                # Use rule-based as opponent
                current_train_env.set_opponent(SimpleRuleBasedAgent(current_train_env))
            else:
                if opponent_id == 0:
                    current_train_env = train_env_rule_based
                else:
                    current_train_env = train_env
                # Take opponent from the previous version of the model
                current_train_env.set_opponent(previous_models[opponent_id])
        else:  # Use passed fine-tune agent as opponent
            current_train_env = train_env
            current_train_env.set_opponent(fine_tune_model)

        # Train the model
        current_train_env.set_opponent_right_side(True)

        chosen_n_steps = num_learn_steps_pre_training if opponent_id == 0 else num_learn_steps  # Iteration 0 is pre-training

        # In order to generate adversarial examples, the adversarial training wrapper needs a reference to the model that is
        # currently being trained.
        if adversarial_training is not None:
            current_train_env.env.victim_model = main_model

        # Optionally add a callback to save intermediate checkpoints
        if save_freq is not None:
            checkpoint_callback = CheckpointCallback(save_freq=save_freq,
                                                     save_path='./output/intermediate/',
                                                     name_prefix=model_name + str(opponent_id + 1) + '_interm')
        else:
            checkpoint_callback = None

        # === LEARNING ===
        main_model.learn(total_timesteps=chosen_n_steps, tb_log_name=model_name, callback=checkpoint_callback)

        # Do evaluation for this training round
        eval_env_rule_based.set_opponent(eval_op)
        avg_round_reward, num_steps = evaluate(main_model, eval_env_rule_based, num_eps=num_eval_eps)
        print(model_name)
        print(f"Average round reward after training: {avg_round_reward}")
        print(f"Average number of steps per episode: {num_steps / num_eval_eps}")

        # Check if there was improvement
        if num_steps > prev_num_steps:  # Model improved compared to last
            print('Model improved')
            prev_num_steps = num_steps
            # Reset patience counter
            patience_counter = 0

            # Save the further trained model to disk
            main_model.save(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Make a copy of the just saved model by loading it
            copy_of_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Save the copy to the list
            previous_models.append(copy_of_model)

            # From here we continue training the same main_model against itself
            opponent_id += 1
        else:
            print('Model did not improve')
            patience_counter += 1
            # Do not save the model
            if patience_counter > patience:
                print('Stopping early due to patience')
                break
            # Because our model did not improve compared to the previous one, we reset our main_model to the previous one
            main_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id))
            main_model.set_env(train_env)

            # Opponent does not change

    if not opponent_pred_obs:
        # Evaluate the last model against each of its previous iterations
        # evaluate_against_predecessors(previous_models, env_rule_based=eval_env_rule_based, env_normal=eval_env, num_eval_eps=num_eval_eps)
        pass  # Not useful right now
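
A minimal invocation of learn_with_selfplay might look like the sketch below; all argument values are illustrative assumptions rather than settings taken from the project:

if __name__ == '__main__':
    learn_with_selfplay(max_agents=10,                      # assumed value
                        num_learn_steps=500_000,            # assumed value
                        num_learn_steps_pre_training=1_000_000,
                        num_eval_eps=50,
                        model_name='dqn',
                        output_folder='output',
                        save_freq=100_000)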
Code Example #27
File: DRL_battery.py Project: jajimer/energym
    else:
        model_path = ''
        if 'gs://' in args.model:
            # Download from given bucket (gcloud configured with privileges)
            client = gcloud.init_storage_client()
            bucket_name = args.model.split('/')[2]
            model_path = args.model.split(bucket_name + '/')[-1]
            gcloud.read_from_bucket(client, bucket_name, model_path)
            model_path = './' + model_path
        else:
            model_path = args.model

        model = None
        if args.algorithm == 'DQN':
            model = DQN.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'DDPG':
            model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'A2C':
            model = A2C.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'PPO':
            model = PPO.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'SAC':
            model = SAC.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'TD3':
            model = TD3.load(model_path, tensorboard_log=args.tensorboard)
        else:
            raise RuntimeError('Algorithm specified is not registered.')

        model.set_env(env)
Code Example #28
File: cupsworld_dqn.py Project: spotter-ai/spotter
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")


def act(env, model):
    # env is deterministic as in if I say "go right" the gripper will go right all the time.
    obs = env.reset()
    for i in range(100):
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        if done:
            print('[FINAL] obs=', obs, 'reward=', reward, 'done=', done)
            break


type = "DQN"
TIME_STEPS = 50000
env = gym.make('CupsWorld-v0')
# train(env, type, TIME_STEPS)
if type == "A2C":
    model = A2C.load('model_cups')
elif type == "DQN":
    model = DQN.load('model_cups')
elif type == "PPO":
    model = PPO.load('model_cups')
act(env, model)
env.close()
Code Example #29
    #env_eval = Monitor(env, './logs/')

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=1000,
                                 deterministic=True,
                                 render=False)

    #Deeper NN
    #model = DQN.load("DQN", env=env)
    model.learn(total_timesteps=5_000_000,
                callback=eval_callback)  # Typically not enough
    model.save("DQN")
    #model = DQN.load("DQN", env=env)
    model = DQN.load("logs/best_model", env=env)
    #model = PPO.load("PPO_discrete", env=env)

    logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                    num_drones=ARGS.num_drones)
    obs = env.reset()
    start = time.time()
    n_trial = 0
    for i in range(ARGS.duration_sec * env.SIM_FREQ):
        if i % AGGR_PHY_STEPS == 0:  # act once every AGGR_PHY_STEPS physics steps
            action, _states = model.predict(
                obs,
                deterministic=True,
            )
            #else:
            #    action = np.array([1,0,0]) #No Turn
Code Example #30
        },
        "policy_frequency": 2,
        "duration": 40,
    })
    env.reset()
    model = DQN('CnnPolicy', env,
                gamma=0.8,
                learning_rate=5e-4,
                buffer_size=40*1000,
                learning_starts=200,
                exploration_fraction=0.6,
                target_update_interval=256,
                batch_size=32,
                verbose=1,
                tensorboard_log="logs/")
    model.learn(total_timesteps=int(2e5))
    model.save("dqn_highway")

    # Record video
    model = DQN.load("dqn_highway")
    env.configure({"policy_frequency": 15, "duration": 20 * 15})
    video_length = 2 * env.config["duration"]
    env = VecVideoRecorder(env, "videos/",
                           record_video_trigger=lambda x: x == 0, video_length=video_length,
                           name_prefix="dqn-agent")
    obs = env.reset()
    for _ in range(video_length + 1):
        action, _ = model.predict(obs)
        obs, _, _, _ = env.step(action)
    env.close()