def LunarLander_v2_DQN():  # TODO: this example still raises an error
    # Create environment
    env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Train the agent
    model.learn(total_timesteps=100000)
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent
    model = DQN.load("dqn_lunar")

    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(mean_reward, std_reward)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
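    # A minimal variant of the rollout loop above (an assumption, not part of the
    # original example): resetting the environment whenever an episode ends avoids
    # stepping a finished LunarLander episode, and closing the env releases the viewer.
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
    env.close()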
Example #2
    def run_train(self):
        env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
        env = make_vec_env(lambda: env, n_envs=1)
        model = None
        if self.event == Scenario.LANE_CHANGE:
            model = DQN(CustomLaneChangePolicy,
                        env,
                        verbose=1,
                        learning_starts=256,
                        batch_size=256,
                        exploration_fraction=0.9,
                        target_network_update_freq=100,
                        tensorboard_log=dir_path + '/Logs/')

        if self.event == Scenario.PEDESTRIAN:
            model = DQN(CustomPedestrianPolicy,
                        env,
                        verbose=1,
                        learning_starts=256,
                        batch_size=256,
                        exploration_fraction=0.9,
                        target_network_update_freq=100,
                        tensorboard_log=dir_path + '/Logs/Ped',
                        gamma=0.93,
                        learning_rate=0.0001)
        model.learn(total_timesteps=20000)
        model.save(MODEL_SAVE_PATH)
Example #3
def train(log_dir, model_dir, env_name, train_timesteps=2500):
    # make sure the log and model directories exist
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_name)

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = DQN(MlpPolicy, env, verbose=1)

    # Train the agent
    model.learn(total_timesteps=train_timesteps)
    
    # Save the agent
    if not model_dir.endswith("/"):
        model_dir += "/"

    model.save(str(model_dir) + "dqn_" + str(env_name) + "_trained_timesteps_" + str(train_timesteps))
    # delete trained model
    del model
def train():

    # Create the environment and a DQN model to generate expert trajectories

    env = gym.make('roundabout-v0')

    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    #Data Augmentation
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are:" + str(expert_data.keys()))
    obs = expert_data['obs']
    obs.shape
    expert_data['obs'] = obs.ravel()  # convert to 1D array
    print("my keys are:" + str(expert_data.keys()))
    np.savez('expert_roundabout.npz', expert_data)

    dataset = ExpertDataset(expert_path='expert_roundabout.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")

    env.close()
    del env
Example #5
def train_multiple(cfg, version, trained_model, double_agent=False):
    # double_agent refers to both agents having learned in multi environment
    if double_agent:
        gym_wrapper = MultiAgentCustomEnv(cfg)
        # model_trained = DQN.load("{0}models/{1}".format("./", trained_model), env=gym_wrapper)
        model_trained = DQN.load("{0}models/{1}".format(
            cfg["study_results"], trained_model),
                                 env=gym_wrapper)
    else:
        gym_wrapper = CustomEnv(cfg)
        # model_trained = DQN.load("{0}models/{1}".format("./", trained_model), env=gym_wrapper)
        model_trained = DQN.load("{0}models/{1}".format(
            cfg["study_results"], trained_model),
                                 env=gym_wrapper)

    gym_wrapper = MultiAgentCustomEnv(cfg,
                                      model_trained,
                                      single=not double_agent)

    model = DQN(MlpPolicy,
                gym_wrapper,
                verbose=1,
                double_q=cfg["double-dqn"],
                prioritized_replay=cfg["prioritized"],
                policy_kwargs=dict(dueling=cfg["dueling"]),
                exploration_fraction=cfg["exploration_frac"],
                tensorboard_log=cfg["study_results"] +
                "tensorboard/experiments/")

    model.learn(total_timesteps=cfg["timesteps"],
                tb_log_name=cfg["experiment_name"])
    model.save("{0}models/{2}-v{1}".format(cfg["study_results"], version,
                                           cfg["experiment_name"]))
Example #6
def train_DQN(env_train, model_name, timesteps=50000):
    start = time.time()
    model = DQN('MlpPolicy', env_train, verbose=1)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DQN): ', (end - start) / 60, ' minutes')
    return model
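# A hypothetical usage sketch for train_DQN above; "CartPole-v1" and the model name
# are illustrative assumptions, and config.TRAINED_MODEL_DIR must already exist.
import gym
from stable_baselines.common.vec_env import DummyVecEnv

env_train = DummyVecEnv([lambda: gym.make("CartPole-v1")])
trained_model = train_DQN(env_train, model_name="dqn_cartpole_demo", timesteps=10000)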
def train():
    env = DummyVecEnv([
        lambda: DemoEnv()
    ])  # DQN does not support parallelization through SubprocVecEnv
    model = DQN(MlpPolicy, env, verbose=1, policy_kwargs={'layers': [4]})
    model.learn(total_timesteps=int(2e5))
    model.save("deepq_DemoEnv")
    env.close()
    del model
Example #8
def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = DQN(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32, \
                exploration_fraction=0.3, target_network_update_freq=32, tensorboard_log=dir_path+'/Logs/')
    model.learn(total_timesteps=1000, log_interval=1000000)
    model.save("Grasp_Model_1")
Example #9
def run(model_name, iteration, world, stage):
    world_stage = 'SuperMarioBros-{}-{}-v2'.format(world, stage)
    env = gym_super_mario_bros.make(world_stage)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=4)
    env = EpisodicLifeEnv(env)
    # env = MaxAndSkipEnv(env)

    # Save a checkpoint every 5000 steps
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./logs/',
                                             name_prefix=model_name)

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")
    steps = 10000

    if iteration > 0:
        model = DQN.load('models/{}'.format(model_name),
                         env=env,
                         verbose=1,
                         learning_starts=2500,
                         learning_rate=1e-4,
                         exploration_final_eps=0.01,
                         prioritized_replay=True,
                         prioritized_replay_alpha=0.6,
                         train_freq=4,
                         tensorboard_log="./mario_tensorboard/")
    else:
        model = DQN(CnnPolicy,
                    env,
                    verbose=1,
                    learning_starts=2500,
                    learning_rate=1e-4,
                    exploration_final_eps=0.01,
                    prioritized_replay=True,
                    prioritized_replay_alpha=0.6,
                    train_freq=4,
                    tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(steps) as progress_callback:
        model.learn(
            total_timesteps=steps,
            # , eval_callback, checkpoint_callback],
            callback=[progress_callback],
            tb_log_name=model_name)
    print("Finished training model on env...\n")
    model.save("models/{}".format(model_name))
Example #10
def launchAgent():
    from stable_baselines import DQN
    import Reinforcement_AI.env.c_seperate_env as sep_env
    from queue import Queue
    from threading import Thread

    minimap_env = sep_env.MinimapEnv()
    allenv = sep_env.AllEnv()

    minimap_model = DQN(
        "CnnPolicy",  # policy
        minimap_env,  # environment
        double_q=True,  # Double Q enable
        prioritized_replay=True,  # Replay buffer enabled
        verbose=0  # log print
    )

    allenv_model = DQN(
        "MlpPolicy",
        allenv,
        double_q=True,
        prioritized_replay=True,
        verbose=0
    )

    for i in range(100):
        if i != 0:
            minimap_model = DQN.load("KR_minimap_" + str(i))
            allenv_model = DQN.load("KR_allenv_" + str(i))

        que = Queue()

        minimap_model.set_env(minimap_env)
        allenv_model.set_env(allenv)

        # minimap_thread = Thread(target=minimap_model.learn, args=[50000])
        # allenv_thread = Thread(target=allenv_model.learn, args=[50000])
        allenv_thread = Thread(target=lambda q, arg1: q.put(allenv_model.learn(arg1)), args=(que, 50000))
        # test = Pool(processes=1)

        # minimap_thread.start()
        allenv_thread.start()
        # test_result = test.apply_async(allenv_model.learn, (50000, None, 100, "DQN", True, None))
        minimap_model.learn(total_timesteps=50000)

        # allenv_model.learn(total_timesteps=50000)

        # minimap_thread.join()
        allenv_thread.join()

        allenv_model = que.get()
        # return_val = test_result.get()

        minimap_model.save("KR_minimap_" + str(i + 1))
        allenv_model.save("KR_allenv_" + str(i + 1))
Example #11
def train_dqn(timesteps, name):
    env = datares_roulette
    env = DummyVecEnv([env])
    model = DQN(
        stable_baselines.deepq.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model
Example #12
def main(log_dir=None, name_results_root_folder="results"):
    args = parseArgs()
    time_steps = TIME_STEPS
    # if log_dir is not given, use a default one that contains the starting time of the training.
    if log_dir is None:
        if args.restart_training:
            # find the latest training folder
            latest_log_dir = os.path.join(
                name_results_root_folder,
                sorted(os.listdir(name_results_root_folder))[-1])
            logdir = latest_log_dir
        else:
            default_log_dir = os.path.join(name_results_root_folder,
                                           "DQN_" + getTimeStr())
            os.makedirs(default_log_dir, exist_ok=True)
            logdir = default_log_dir
    else:
        logdir = log_dir
    reward_bound = REWARD_BOUND
    # get arena environments and custom callback
    env = Monitor(Arena2dEnvWrapper(0, True),
                  os.path.join(logdir, "arena_env0"))
    # env = Arena2dEnvWrapper(0, True)
    call_back = SaveOnBestTrainingRewardCallback(500, logdir, 1, reward_bound)
    # set a temporary model path; if training is interrupted from the keyboard, the current model parameters will be saved.
    path_temp_model = os.path.join(logdir, "DQN_TEMP")
    if not args.restart_training:
        model = DQN(MlpPolicy,
                    env,
                    gamma=GAMMA,
                    learning_rate=LEARNING_RATE,
                    buffer_size=BUFFER_SIZE,
                    target_network_update_freq=SYNC_TARGET_STEPS,
                    tensorboard_log=logdir,
                    verbose=1)
        reset_num_timesteps = True
    else:
        if os.path.exists(path_temp_model + ".zip"):
            print("continue training the model...")
            model = DQN.load(path_temp_model, env=env)
            reset_num_timesteps = False
        else:
            print(
                "Can't load the model with the path: {}, please check again!".
                format(path_temp_model))
            env.close()
            exit(-1)
    # try:
    model.learn(time_steps,
                log_interval=200,
                callback=call_back,
                reset_num_timesteps=reset_num_timesteps)
    model.save(os.path.join(logdir, "DQN_final"))
Example #13
def traindqn(args):
    

    # with tf.device('/device:CUDA:1'):
    with tf.device('/gpu:0'):

        env = gym.make('python_1p-v0')
        # env = Monitor(env, filename=None, allow_early_resets=True)
        env = DummyVecEnv([lambda: env])

        model = DQN(DqnCnnPolicy, env, verbose=1, learning_rate=0.0001, exploration_fraction=0.4, train_freq=10)
        model.learn(5000000)
        model.save("dqnwithcnn.pth")
Example #14
def train_agent(agent):
    
    # Get the parameters (the common ones)
    environment_var = get_recipe_config()['environment']
    agent_var = get_recipe_config()['agent']
    policy_var = get_recipe_config()['policy']
    gamma_var = get_recipe_config()['gamma']
    lr_var = get_recipe_config()['dqn_learning_rate']
    training_episodes_var = 5000

    
    # Create the JSON FILE and dump it into the output folder
    training_infos = {
        'name': environment_var,
        'agent': agent_var,
        'type': 'OpenAI Gym',
        'num_episodes': training_episodes_var,
        'lr': lr_var,
        'gamma': gamma_var,
        'policy': policy_var,
        'training_date': str(datetime.datetime.now())
    }
    
    saved_models = dataiku.Folder(get_output_names_for_role('main_output')[0])
    saved_models_info = saved_models.get_info()
    saved_models_path = saved_models.get_path()
    
    with open(saved_models_path + '/training_infos.json', 'w') as fp:
        json.dump(training_infos, fp)

    # Choose the agent
    if agent == "dqn":
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines.deepq.policies import CnnPolicy
        from stable_baselines import DQN
        
        model = DQN(policy=policy_var, env=environment_var, gamma=gamma_var, learning_rate=lr_var)
        
    # Start the training and dump the model into the output folder
    print("========================== Start Training ==========================")
    model.learn(training_episodes_var)
    model_name = agent_var + "_" + environment_var
    print("Model Saved")
    model.save(saved_models_path + "/" + model_name)
    
    
    
    
    
def run_model(algorithm, training_timesteps, testing_timesteps,
              training_iterations, testing_iterations, learning_rate,
              batch_size):

    # DataFrame to collect evaluation statistics
    columns = ['Mean Rewards', 'Standard deviation']
    df = pd.DataFrame(columns=columns)

    model = DQN(CustomPolicy,
                env,
                learning_rate=learning_rate,
                batch_size=batch_size)

    for k in range(training_iterations):
        model.learn(total_timesteps=int(training_timesteps))
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)

    for j in range(testing_iterations):
        # Load the trained agent

        model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm,
                                              hostname))
        # Reset the environment
        obs = env.reset()
        # Create an empty list to store reward values
        final_rewards = []
        for _ in range(testing_timesteps):
            # predict the values
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones == True:
                final_rewards.append(rewards)
        # Print the mean reward
        print(np.mean(final_rewards))
        # Print the standard deviation of reward
        print(np.std(final_rewards))
        # Create a DataFrame to save the mean and standard deviation
        df = df.append(
            {
                'Mean Rewards': np.mean(final_rewards),
                'Standard deviation': np.std(final_rewards)
            },
            ignore_index=True)

        df.to_csv("{}_{}_{}".format(1,
                                    algorithm,
                                    "MeanAndStdReward.csv",
                                    sep=',',
                                    index=True))

        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)
class DqnController:
    """
    Implements an RL (DQN) controller
    """
    def __init__(self, env):
        """
        :param: env: a thermostat environment
        """
        self.env = env
        self.model = DQN(MlpPolicy,
                         env,
                         verbose=1,
                         tensorboard_log="./dqn_thermostat_tensorboard/")

    @staticmethod
    def name():
        return "Dqn"

    def train(self):
        self.model.learn(total_timesteps=50000)

    def save(self):
        self.model.save("dqn.pk")

    def load(self):
        self.model = None
        self.model = DQN.load("dqn.pk")

    def simulate(self):
        state = self.env.reset()
        cumulative_reward = 0.0
        P_consumed = []
        done = False
        while not done:
            action, _state = self.model.predict(state)
            state, reward, done, info = self.env.step(action)
            cumulative_reward += reward
            P_consumed.append(action)
        print("MSE Setpoint- realized: %.3f - Energy consumed: %.2f" %
              (cumulative_reward, sum(P_consumed)))
        result_folder = "results/" + self.name(
        ) + "/" + self.env.start_date.strftime(
            "%m-%d-%Y") + "_to_" + self.env.end_date.strftime("%m-%d-%Y")
        self.env.store_and_plot(result_folder)

    def set_env(self, env):
        self.env = env
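# A hypothetical usage sketch for DqnController; ThermostatEnv stands in for
# whatever thermostat environment the original project provides.
env = ThermostatEnv()
controller = DqnController(env)
controller.train()     # model.learn for 50000 timesteps
controller.save()      # writes the model to "dqn.pk"
controller.load()      # reloads it with DQN.load
controller.simulate()  # rolls out one episode and stores/plots the results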
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm") 
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha")
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm 
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy

        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")

    del model # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)
    
    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"

    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str)

        myfile.close()
    else:
        print(str(mean_reward) + hparams_str)
def trainAgent(env):
    model = DQN(
        env=env,
        policy=MlpPolicy,
        verbose=1,
        # alpha: too low a learning rate makes training progress very slowly;
        # too high a learning rate can cause the loss to diverge.
        learning_rate=0.05,
        # gamma controls the importance of future rewards versus immediate ones.
        gamma=0.95,
        exploration_initial_eps=1.0,
        exploration_fraction=0.9,
        exploration_final_eps=0.01,
        buffer_size=56,
        batch_size=50)
    model.learn(total_timesteps=700)
    model.save('./trained-agents/C1')
    print('Model trained and saved.')
Example #19
def train(params):

    # setup config
    if params.get("policy") == 'mlp':
        policy = MlpPolicy
        env = gym.make(params.get("environment"))
    else:
        policy = CnnPolicy
        env = gym.make(params.get("environment"))
        env.configure(CNN_config)
        env.reset()

    exp_name = ("{0}_{1}_{2}".format(params.get("model_name"),
                                     params.get("policy"),
                                     params.get("environment")))

    log_dir = './logs/' + exp_name

    # create model
    model = DQN(
        policy,
        env,
        verbose=1,
        tensorboard_log=log_dir,
        buffer_size=params.get("buffer_size"),
        learning_rate=params.get("learning_rate"),
        gamma=params.get("gamma"),
        target_network_update_freq=params.get("target_update_interval"),
        exploration_fraction=params.get("exploration_fraction"),
        exploration_final_eps=params.get("exploration_final_eps"),
        learning_starts=params.get("learning_starts"),
        batch_size=params.get("batch_size"),
        exploration_initial_eps=params.get("exploration_initial_eps"),
        double_q=True,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-06,
        train_freq=params.get("train_freq"),
        policy_kwargs=policy_kwargs)

    model.learn(total_timesteps=params.get("train_steps"), log_interval=10)
    model.save(exp_name)
    env.close()
    del env
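# A hypothetical params dict for train() above; every value is an illustrative
# assumption (the original project also defines policy_kwargs and CNN_config globally).
params = {
    "model_name": "dqn_demo",
    "policy": "mlp",
    "environment": "CartPole-v1",
    "buffer_size": 50000,
    "learning_rate": 1e-4,
    "gamma": 0.99,
    "target_update_interval": 1000,
    "exploration_fraction": 0.1,
    "exploration_final_eps": 0.02,
    "exploration_initial_eps": 1.0,
    "learning_starts": 1000,
    "batch_size": 32,
    "train_freq": 4,
    "train_steps": 100000,
}
train(params)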
Example #20
def AirRaid_main():
    env = retro.make('AirRaid-Atari2600', use_restricted_actions=retro.Actions.DISCRETE)
    model = DQN(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("AirRaid_Model")

    del model

    model = DQN.load("AirRaid_Model")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rew, done, info = env.step(action)
        #env.render()
        if done:
            obs = env.reset()
    env.close()
Example #21
def train(algorithm='dqn', timesteps=2e5):

    # env = gym.make('LunarLander-v2')  # This uses the library version of the Lunar Lander env.
    print('algorithm: ', algorithm)
    print('timesteps: ', timesteps)

    learning_rate = 0.001

    if algorithm.lower() == 'dqn':
        env = LunarLander()
        model = DQN('MlpPolicy', env, learning_rate=learning_rate,
                    prioritized_replay=True,
                    verbose=1)
    elif algorithm.lower() == 'ppo2':
        n_envs = 4
        env = SubprocVecEnv([lambda: LunarLander() for i in range(n_envs)])

        schedule = LinearSchedule(int(float(timesteps)), 0.00001, 0.1).value
        model = PPO2('MlpPolicy', env, learning_rate=schedule,
                     verbose=1)
    else:
        raise RuntimeError("Unknown algorithm. %s" % algorithm)

    # mean_reward, std_reward = evaluate_policy(
    #     model, model.get_env(), n_eval_episodes=10)

    # Train the agent
    model.learn(total_timesteps=int(float(timesteps)), log_interval=10)
    # Save the agent
    model.save("trained_models/latest")

    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
    model.save("trained_models/lunar_climber_%s-%s" %
               (algorithm.lower(), dt_string))

    # # Plot training progress
    # plt.plot(env.all_rewards)
    # plt.ylabel('Reward')
    # plt.xlabel('Timesteps')
    # plt.savefig('figures/stats-%s.png' % dt_string)

    print("Model trained!")
Example #22
def sb_model_train(rl_manager):
    env = CustomEnv(rl_manager)
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN(CustomPolicy,
                env,
                verbose=1,
                learning_starts=256,
                batch_size=256,
                exploration_fraction=0.5,
                target_network_update_freq=10,
                tensorboard_log='./Logs/')
    # model = DQN(MlpPolicy, env, verbose=1, learning_starts=64,  target_network_update_freq=50, tensorboard_log='./Logs/')
    # model = DQN.load("DQN_Model_SimpleSim_30k",env=env,exploration_fraction=0.1,tensorboard_log='./Logs/')
    model.learn(total_timesteps=10000)
    # model = PPO2(MlpPolicy, env, verbose=1,tensorboard_log="./Logs/")
    # model.learn(total_timesteps=20000)
    model.save(dir_path + "/DQN_Model_SimpleSim")
    # sb_model_test(rl_manager)
    return
def run_model(algorithm, training_timesteps, testing_timesteps, training_iterations, testing_iterations, learning_rate, batch_size):
    columns = ['Mean Rewards', 'Standard deviation']
    df = pd.DataFrame(columns=columns)
    if algorithm == "PPO2":
        from stable_baselines.common.policies import MlpPolicy
        model = PPO2(MlpPolicy, env, verbose=1, learning_rate=learning_rate,
                     tensorboard_log="./{}_rcrs_tensorboard/".format(hostname), n_steps=batch_size)
    else:
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, verbose=1, learning_rate=learning_rate,
                    tensorboard_log="./{}_rcrs_tensorboard/".format(hostname), batch_size=batch_size)
    for k in range(training_iterations):
        # Train the agent
        model.learn(total_timesteps=int(training_timesteps))
        # Save the model
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)

    for j in range(testing_iterations):
        # Load the trained agent
        if algorithm == "PPO2":
            model = PPO2.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        else:
            model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        # Reset the environment
        obs = env.reset()
        # Create an empty list to store reward values
        final_rewards = []
        for _ in range(testing_timesteps):
            # Predict the action values
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                final_rewards.append(rewards)
        # Print the mean reward
        print(np.mean(final_rewards))
        # Print the standard deviation of reward
        print(np.std(final_rewards))
        # Append the mean and standard deviation to the DataFrame
        df = df.append({'Mean Rewards': np.mean(final_rewards), 'Standard deviation': np.std(final_rewards)}, ignore_index=True)
        df.to_csv("{}_{}_{}".format(algorithm, hostname, "MeanAndStdReward.csv"), sep=',', index=True)

        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)
Example #24
def trainAgent(env, agent):
    model = DQN(
        env=env,
        policy=MlpPolicy,
        verbose=1,
        # alpha: too low a learning rate makes training progress very slowly;
        # too high a learning rate can cause the loss to diverge.
        learning_rate=0.1,
        # gamma controls the importance of future rewards versus immediate ones.
        gamma=0.95,
        exploration_initial_eps=1.0,
        exploration_fraction=0.8,
        exploration_final_eps=0.1,
        buffer_size=56,
        batch_size=50)

    # training with 5 flows of 300M
    agent_string = 'DQN-flow-byte-count-' + agent
    model.learn(total_timesteps=10000)  # 5000
    model.save('./trained-agents/' + agent_string)
    print('Trained and saved model: ', agent_string)
Example #25
def traindqn(args):
    '''
    An example with an agent which requires a single agent setup
    '''
    with tf.device('/gpu:0'):
        env = gym.make('python_1p-v0')
        env = SAhandler(env)

        model = DQN(DqnCnnPolicy,
                    env,
                    verbose=1,
                    learning_rate=5e-4,
                    exploration_fraction=0.1,
                    exploration_final_eps=0.01,
                    buffer_size=50000,
                    train_freq=1,
                    prioritized_replay=True,
                    target_network_update_freq=1000)
        model.learn(int(1e6))
        model.save("dqnwithcnn", cloudpickle=True)
Example #26
def train_single(cfg, version, load_model=None):
    gym_wrapper = CustomEnv(cfg)
    if load_model is None:
        model = DQN(MlpPolicy,
                    gym_wrapper,
                    verbose=1,
                    double_q=cfg["double-dqn"],
                    prioritized_replay=cfg["prioritized"],
                    policy_kwargs=dict(dueling=cfg["dueling"]),
                    exploration_fraction=cfg["exploration_frac"])
        # tensorboard_log=cfg["study_results"] + "tensorboard/experiments/")
    else:
        model = DQN.load("{}models/single_dqn_transport".format(
            cfg["study_results"]),
                         env=gym_wrapper)

    model.learn(total_timesteps=cfg["timesteps"],
                tb_log_name=cfg["experiment_name"])
    model.save("{0}models/{2}-v{1}".format(cfg["study_results"], version,
                                           cfg["experiment_name"]))
Example #27
def run():
    # hyperparameters
    gamma = 0.99  #discount factor
    learning_rate = 0.00025  #learning rate for adam optimizer
    buffer_size = 50000  #size of the replay buffer
    exploration_fraction = 0.1  #fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps = 0.02  #final value of random action probability
    exploration_initial_eps = 1.0  #initial value of random action probability
    train_freq = 1  #update the model every train_freq steps
    batch_size = 32  #size of a batched sampled from replay buffer for training
    double_q = True  #whether to enable Double-Q learning or not.
    learning_starts = 100  #how many steps of the model to collect transitions for before learning starts
    timesteps = 1000  #2000
    verbose = 1

    env = gym.make('Boxoban-Train-v1')

    model = DQN(MlpPolicy,
                env,
                gamma=gamma,
                learning_rate=learning_rate,
                buffer_size=buffer_size,
                exploration_fraction=exploration_fraction,
                exploration_final_eps=exploration_final_eps,
                exploration_initial_eps=exploration_initial_eps,
                train_freq=train_freq,
                batch_size=batch_size,
                double_q=double_q,
                learning_starts=learning_starts,
                verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save("trained_models/dqn_sokoban_model")

    # Enjoy trained agent
    obs = env.reset()
    print(model.action_probability(obs))
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
def main():
    # create the environment
    env = gym.make("gym_balanceBot-v0")

    if not os.path.isfile("trained_model/dqn_balanceBot.zip"):
        # Instantiate the agent
        model = DQN('MlpPolicy',
                    env,
                    learning_rate=1e-3,
                    prioritized_replay=True,
                    verbose=1)

        # Train the agent
        model.learn(total_timesteps=int(2e5))
        # Save the agent
        model.save("trained_model/dqn_balanceBot")
        del model  # delete trained model to demonstrate loading

        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")

        # Evaluate the agent
        mean_reward, std_reward = evaluate_policy(model,
                                                  model.get_env(),
                                                  n_eval_episodes=10)

    else:
        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")

    # Enjoy trained agent
    obs = env.reset()
    for i in range(3000):
        action, states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(1. / 240.)

    env.close()
Example #29
    def leveltrain(self, from_level, to_level, env, timesteps, level_modelpath,
                   tensorboard_logs_path):
        model = DQN('MlpPolicy',
                    env,
                    verbose=1,
                    policy_kwargs=self.policy_kwargs,
                    prioritized_replay=True,
                    buffer_size=100000,
                    learning_rate=0.0003,
                    exploration_final_eps=0,
                    tensorboard_log=tensorboard_logs_path)
        model.save(level_modelpath)

        for current_level in range(
                from_level,
                to_level + 1):  # Train model for increasingly difficult levels
            env = gym.make('DeepWellEnvSpherlevel' + str(current_level) +
                           '-v0')

            model = self.load(level_modelpath,
                              tensorboard_logs_path)  # Load previous model
            env_str = self.get_env_str(env)
            model.set_env(make_vec_env(env_str, n_envs=1))
            model.learn(total_timesteps=timesteps,
                        reset_num_timesteps=False,
                        tb_log_name="TB_" +
                        datetime.now().strftime('%d%m%y-%H%M')
                        )  # Continue training previous model

            level_modelpath = level_modelpath[0:-1] + str(
                current_level)  # Generate new name of newly trained model
            model.save(level_modelpath)  # Save newly trained model

            print("====================== Level " + str(current_level) +
                  " finished with " + str(timesteps) +
                  " timesteps ==========================")

        return model
Example #30
def train_DQN():
    simulation_start_time = time.time()
    model = DQN(MlpPolicy,
                env,
                verbose=1,
                tensorboard_log="./gym_jobshop_tensorboard_logs/")
    custom_callback = CustomCallback()
    # model = MlpPolicy
    # Call Tensorboard logs from a terminal in folder "masterarbeit" (root folder of the project)
    # tensorboard --logdir ReinforcementLearning/gym_jobshop_tensorboard_logs/DQN_1

    # keyboard input: what does the user want to do?
    # a) train for x steps
    # 10000
    # b) print the current values
    # dqn proba_step(latest state from the environment as observation + 2 fixed observations
    # dqn step(see above
    model.learn(total_timesteps=10000, callback=custom_callback)

    model.save("deepq_jobshop")
    print("Training finished after " +
          str(round(time.time() - simulation_start_time, 4)) + " seconds")
    return