Example #1
def run_model(params, rollout_size=50, num_steps=50):
    """Perform the training operation.

    Parameters
    ----------
    params : dict
        flow-specific parameters (see flow/utils/registry.py)
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params, version=0)()
    env = DummyVecEnv([lambda: constructor])

    model = TRPO(
        'MlpPolicy',
        env,
        verbose=2,
        timesteps_per_batch=rollout_size,
        gamma=0.999,
        policy_kwargs={
            "net_arch": [100, 50, 25]
        },
    )
    model.learn(total_timesteps=num_steps)

    return model
Example #2
def train_trpo(seed):
    """
    test TRPO on the uav_env(cartesian,discrete)
    """
    """
    TRPO(policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, 
    lam=0.98, entcoeff=0.0, cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3, verbose=0, 
    tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'TRPO'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    # Tested with: timesteps_per_batch=1024
    model = TRPO(policy=MlpPolicy, env=env, gamma=0.99, timesteps_per_batch=128,
                 max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=0.01,
                 vf_stepsize=0.0003, vf_iters=3, verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = TRPO.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
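Example #2 relies on a `callback` and on the module-level `log_dir`, `best_mean_reward`, and `n_steps` globals that the snippet does not show. A minimal sketch of such a callback, following the stable-baselines 2.x convention of a function that receives the local and global variable dicts and returns a bool; the 1000-call check interval, the 100-episode averaging window, and the `log_dir` value are assumptions rather than part of the original snippet:

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = './logs/'  # assumed value; Example #2 only uses log_dir as a pre-existing global
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """Periodically check training progress and keep the best model seen so far."""
    global n_steps, best_mean_reward
    if (n_steps + 1) % 1000 == 0:
        # Episode rewards written by the Monitor wrapper (Example #2 later renames /tmp/gym/monitor.csv)
        x, y = ts2xy(load_results('/tmp/gym/'), 'timesteps')
        if len(y) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Save the model that Example #2 reloads as log_dir + 'best_model.pkl'
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True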
Example #3
def run_experiment(args):

    randomization_settings = {
        "engagement_distance": (100, 100),
        "turnframes": (args.turnframes, args.turnframes)
    }

    if args.randomize_engagement:
        randomization_settings["engagement_distance"] = (100, 200)

    vecEnv = None
    if args.num_envs == 1:
        # Create dummyvecenv
        env = gym.make(args.env)
        env = Monitor(
            TorilleWrapper(env, 100, args.experiment_name,
                           randomization_settings), args.experiment_name)
        vecEnv = DummyVecEnv([
            lambda: env
        ])  # The algorithms require a vectorized environment to run
    else:
        vecEnv = []

        def make_env():
            env = gym.make(args.env)
            unique_id = str(time.time())[-6:]
            experiment_env_name = args.experiment_name + ("_env%s" % unique_id)
            return Monitor(
                TorilleWrapper(env, 100, experiment_env_name,
                               randomization_settings), experiment_env_name)

        for i in range(args.num_envs):
            vecEnv.append(make_env)

        vecEnv = SubprocVecEnv(vecEnv)

    steps_per_env = args.steps_per_batch // args.num_envs

    # Standard 2 x 64 network with sigmoid activations
    policy_kwargs = dict(act_fun=tf.nn.sigmoid, net_arch=[64, 64])
    model = None
    if args.agent == "ppo":
        model = PPO2(MlpPolicy,
                     vecEnv,
                     policy_kwargs=policy_kwargs,
                     ent_coef=args.ent_coef,
                     n_steps=steps_per_env,
                     verbose=1)
    elif args.agent == "trpo":
        model = TRPO(MlpPolicy,
                     vecEnv,
                     policy_kwargs=policy_kwargs,
                     entcoeff=args.ent_coef,
                     timesteps_per_batch=steps_per_env,
                     verbose=1)

    model.learn(total_timesteps=args.timesteps)
Example #4
def trpo(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None):
    from stable_baselines import TRPO
    env = gym.make(env_id)

    model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "trpo", env_id, policy, seed)
Example #5
def trpo(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = TRPO(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with TRPO.")
    model.learn(total_timesteps=timesteps)

    env.close()
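Example #5 wraps the environment in a Monitor that writes episode statistics into log_dir but never reads them back. A possible follow-up, sketched with stable-baselines' results_plotter helper; the CartPole environment, timestep count, and log directory below are placeholder assumptions:

import matplotlib.pyplot as plt
from stable_baselines import results_plotter

log_dir = "/tmp/trpo_cartpole/"         # placeholder log directory
trpo("CartPole-v1", log_dir, 100000)    # train and write monitor files into log_dir

# Plot the episode rewards recorded by the Monitor wrapper during training
results_plotter.plot_results([log_dir], 100000, results_plotter.X_TIMESTEPS, "TRPO CartPole-v1")
plt.show()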
Example #6
def run_model(config, budget):
    """
       Initializes the environment in which the model is evaluated, retrieves the values 
       for the current hyperparameter configuration, initializes and trains
       the given model. 


        Parameters:
        --------
            config: ConfigSpace object containing sampled values for a given hyperparameter configuration
            budget: how much of a full run is currently used to estimate mean loss
        
        Returns:
        --------
            A metric used to evaluate the performance of the current configuration. 
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Cast the sampled hyperparameter values to the types TRPO expects
    config['timesteps_per_batch'] = int(config['timesteps_per_batch'])
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        config[parameter_name] = float(config[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy,
                 env,
                 verbose=1,
                 timesteps_per_batch=config['timesteps_per_batch'],
                 vf_stepsize=config['vf_stepsize'],
                 max_kl=config['max_kl'],
                 gamma=config['gamma'],
                 lam=config['lam'])

    # Scale the training length by the assigned budget fraction
    total_timesteps = 10000
    budget_steps = int(total_timesteps * budget)
    model.learn(total_timesteps=budget_steps)

    result = evaluate(env, model)
    return result
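Example #6 treats `config` as a dict-like object of sampled hyperparameters (and relies on an `evaluate` helper that is not shown). One way such a configuration could be produced with the ConfigSpace library; the value ranges and the 0.25 budget below are illustrative assumptions, not values taken from the snippet:

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

cs = CS.ConfigurationSpace()
cs.add_hyperparameters([
    CSH.UniformIntegerHyperparameter('timesteps_per_batch', lower=64, upper=2048),
    CSH.UniformFloatHyperparameter('vf_stepsize', lower=1e-5, upper=1e-2, log=True),
    CSH.UniformFloatHyperparameter('max_kl', lower=0.001, upper=0.1, log=True),
    CSH.UniformFloatHyperparameter('gamma', lower=0.9, upper=0.999),
    CSH.UniformFloatHyperparameter('lam', lower=0.9, upper=1.0),
])

# Sample one configuration and evaluate it on a quarter of a full training run
config = cs.sample_configuration().get_dictionary()
result = run_model(config, budget=0.25)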
Example #7
def load_model(path: str, env, desc: str):
    """ Loads a model from a stable baseline checkpoint file into a memory representation 

    Args:
        path        (str)           :       Path to the Stable Baseline Checkpoint File 
        env         (SB Env)        :       Path to the Stable Baseline Checkpoint File 
        desc        (str)           :       Text Description of what model this is

    Returns:
        The loaded model
    """

    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
Example #8
def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]

    for s in seeds:
        # Load Models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]

        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]

        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)

            for i in range(50):
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)
Example #9
    def my_compute_data(self, args, env, params, n_episodes):
        env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))
        for alg, start_index, end_index, step, suffix in params:
            re_d = []
            sr_d = []
            rewards, s_rates = [], []
            for i in range(start_index, end_index, step):
                print("")
                print(
                    f"Working on alg={alg}, start_index={start_index}, end_index={end_index}, step={step}, suffix={suffix}, i={i}"
                )
                path = f"{self.base_dir}models/{alg}/quadcopter-v{args.env}-{i}{suffix}.pkl"
                print(f"Evaluating model at {path}")
                if not os.path.exists(path):
                    print(f"WARNING: File {path} does not exist --> SKIPPING")
                    continue

                if alg == "ddpg":
                    model = DDPG.load(path)
                elif alg == "ppo":
                    model = PPO2.load(path)
                else:
                    model = TRPO.load(path)
                r, su = mean_eval(n_episodes, model, env, False, False)
                print(f"Average Success Rate: {su}")
                rewards.append(r)
                s_rates.append(su[0])

            i_max = np.argmax(s_rates)
            re_d.append(rewards)
            sr_d.append(s_rates)
            return re_d, sr_d
Example #10
def mainUp(arg):
    test = arg == TEST
    
    env = fet.FurutaEnvPosTrpoUp(cm.RUN, render = not test) 
    #env.setRender(True)
    model = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")
    
    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print("\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d" % (sum(buf_rew)/float(len(buf_rew)), total_count/float(test_count), test_cutoff_count - overspeed))
            break
            
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" % (episode_rew/count, count))
Example #11
def test(model_path: str, exp_config: dict):

    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)

    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    raw_env.configure(seed=42,
                      mass_mean=(0.05, 1.5),
                      mass_stdev=(0.01, 0.15),
                      embed_knowledge=exp_config.get('embed_knowledge', False),
                      perfect_knowledge=exp_config.get('perfect_knowledge',
                                                       False),
                      gym_env=test_env)

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)

    for test_ep in range(runs.shape[0]):

        obs = test_env.reset()

        if TEST_LINSPACE_MASS:
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]

        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]

        while True:
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            episode = len(rewards_by_episode)
            if episode != test_ep:
                break

        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = (mass_distr_params[0], mass_distr_params[1],
                            sampled_mass, last_tot_reward)

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')

    return runs
Example #12
def test_action_mask_run_trpo(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = TRPO(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
Example #13
def test(testing_data, model_file, result):
    model = TRPO.load(model_file)

    # set testing environment
    stock_test_data = StocksData.read_csv(testing_data)
    stocks_test_env = StocksEnv(stock_test_data,
                                bars_count=10,
                                reset_on_close=False)
    obs = stocks_test_env.reset()

    # set vars for recording results
    result_df = pandas.DataFrame([],
                                 columns=['date', 'open', 'action', 'reward'])
    net_reward = 0.0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = stocks_test_env.step(action)

        # print and record the offset, action taken, reward, opening price
        df = pandas.DataFrame([[
            stock_test_data.date[int(info["offset"])],
            stock_test_data.open[int(info["offset"])],
            Actions(action).name, reward
        ]],
                              columns=['date', 'open', 'action', 'reward'])
        print(df)
        result_df = result_df.append(df, ignore_index=True)
        net_reward += reward

        # at end of episode, record results and exit
        if done:
            print('Net Reward: ', net_reward)
            result_df.to_csv(result, index=False)
            break
Example #14
def trpo(env, seed):
    return TRPO('MlpPolicy',
                env,
                vf_iters=5,
                vf_stepsize=0.001,
                verbose=1,
                tensorboard_log="./data/runs",
                seed=seed)
Example #15
def train(params):

    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("expert_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

    if params.get("expert_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
    if params.get("expert_name") == 'TRPO' or params.get(
            "expert_name") == 'PPO':
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # remaining hyperparameters use the GAIL defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=1000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)

    env.close()
    del env
Example #16
def render_to_gif():
    def save_frames_as_gif(frames,
                           path='./',
                           filename='growspace_with_trpo.gif'):
        # Mess with this to change frame size
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)

        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(),
                                       animate,
                                       frames=len(frames),
                                       interval=50)
        anim.save(path + filename, writer='imagemagick', fps=60)

    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")

    # del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    frames = []
    obs = env.reset()
    for _ in range(150):
        # while True:
        frames.append(env.render(mode="rgb_array"))

        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # if done:
        #     break
        # env.render()

    env.close()
    save_frames_as_gif(frames)
Example #17
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy,
                 env,
                 timesteps_per_batch=512,
                 max_kl=0.001,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.98,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
    # Free memory
    del env
Example #18
def loader(algo, env_name):
    if algo == 'dqn':
        return DQN.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'ppo2':
        return PPO2.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'a2c':
        return A2C.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'acer':
        return ACER.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'trpo':
        return TRPO.load("trained_agents/" + algo + "/" + env_name + ".pkl")
Example #19
def render_growspace_with_trpo():
    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    #
    # del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    for t in range(150):
        print(t)
        # while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)

        # if dones:
        #     env.reset()
        env.render()
Example #20
def trpo(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None,
         load_weights=None):
    from stable_baselines import TRPO
    env = gym.make(env_id)

    if load_weights is not None:
        model = TRPO.load(load_weights, env=env, verbose=0)
    else:
        model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="trpo", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example #21
def mainHybrid(arg):
    test = arg == TEST
    
    env = fet.FurutaEnvPosTrpo(cm.RUN, render = not test) 
    #env.setRender(True)
    modelBal = TRPO.load(POLICY_PATH + "trpo_pos_policy_bal.zip")
    modelUp = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" % (sum(buf_rew)/float(len(buf_rew)), test_cutoff_count - overspeed, complete_count))
            break
            
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
                
            obs, rew, done, _ = env.step(action)
            
            if speedCheck(obs):
                overspeed += 1
                
            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))
Example #22
    def f_checkpoints_range_2_mean_performance(
            self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
        logging.debug(
            f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
        )
        rewards = np.zeros(len(checkpoints))
        s_rates = np.zeros(len(checkpoints))
        # Intent
        # - Iterate over this range to load the associated Stable Baselines model checkpoint
        # - Pass that model to the `mean_eval` evaluation function, which evaluates the model on
        #   - a certain number of episodes
        #   - a certain env
        #   - a continuous or non-continuous action space
        # - Each evaluation returns a reward and an average success rate
        #
        # Evaluating N checkpoints on M queries each and averaging over M finally yields N rewards and N success rates

        j = 0
        """ NOTE: i can range in anyway while j iterates over the numpy array 
        """
        for i in checkpoints:
            path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
            logging.debug(f"Evaluating model at {path}")
            if self.args.model['name'] == "ddpg":
                model = DDPG.load(path)
            elif self.args.model['name'] == "ppo":
                model = PPO2.load(path)
            elif self.args.model['name'] == "trpo":
                model = TRPO.load(path)
            elif self.args.model['name'] == "td3":
                model = TD3.load(path)
            elif self.args.model['name'] == "sac":
                model = SAC.load(path)
            logging.debug(
                f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
            )
            rewards_list, success_rates_list = mean_eval(
                num_episodes=self.args.n_episodes,
                checkpoint_id=i,
                model=model,
                env=self.env,
                v=True,
                continuous=self.args.continuous,
                plots_dir=self.args.plots_dir)
            rewards_mean = np.mean(rewards_list)
            success_rates_mean = np.mean(success_rates_list)
            logging.debug(
                f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
            )
            rewards[j] = rewards_mean
            s_rates[j] = success_rates_mean
            j += 1
        return rewards, s_rates
Example #23
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = TRPO("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARING trpo")
    original_env.force_progression = False
    model.learn(int(2e5), seed=seed)
    print("DONE LEARING trpo")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
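Example #23's objective function returns a mean episode reward, so a driver study would typically maximise it. A possible driver sketched with Optuna; the trial count is arbitrary, and the `optimize_ddpg` helper and the `rustyblocks-v0` registration used inside the objective are assumed to be available:

import optuna

# Maximise the mean episode reward returned by optimize_agent
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)

print("Best TRPO hyperparameters:", study.best_params)
print("Best mean reward:", study.best_value)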
Example #24
def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):

    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu

    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512])

    print('TensorBoard logs available at:', tensorboard_log_dir, file=sys.stderr)
    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)

        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])

        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp" + text


    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

    log_name = f"_model={model_name}_time={time}"
    print('View them with: tensorboard --logdir', tensorboard_log_dir + log_name)
    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO, filename=f"{console_log_dir}/{log_name}.log", datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"


    start = datetime.now()
    print("Learning model", file=sys.stderr)

    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name, callback=callback)

    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
Example #25
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
Example #26
def train_trpo(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    env = gym.make(config.env_name)

    model = TRPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=config.num_updates,
                callback=WandbStableBaselines2Callback())
    if save_model:
        model.save(f"trpo_{config.env_name}")
Example #27
def train(training_data, training_timesteps, model_file):
    stocks_data = StocksData.read_csv(training_data)
    stocks_env = StocksEnv(stocks_data,
                           bars_count=DEFAULT_BARS_COUNT,
                           reset_on_close=False,
                           commission_perc=0.01)
    model = TRPO(MlpPolicy,
                 stocks_env,
                 verbose=1,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=training_timesteps)
    model.save(model_file)
Example #28
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')

    model = TRPO.load("pickbot_model_trpo_discrete_2019-03-11 10:22:01")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            # Query the policy at every step rather than only once per episode
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            episode_rew += rewards
        print("Episode reward", episode_rew)
Example #29
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_trpo_discrete_"+timestamp+".pkl")
    model.save("pickbot_model_trpo_discrete_"+timestamp)
Example #30
    def create_trpo(self):
        return TRPO(MlpPolicy,
                    self.env,
                    gamma=0.99,
                    timesteps_per_batch=1024,
                    max_kl=0.01,
                    cg_iters=10,
                    lam=0.98,
                    entcoeff=0.0,
                    cg_damping=0.01,
                    vf_stepsize=0.0003,
                    vf_iters=3,
                    verbose=0,
                    tensorboard_log=config.ROOT_DIR + config.LOG_PATH,
                    _init_setup_model=True,
                    policy_kwargs=None,
                    full_tensorboard_log=False,
                    seed=None,
                    n_cpu_tf_sess=None)