Example #1
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(save_freq=int(
        validate_every_timesteps / num_of_envs),
                                             save_path=log_relative_path,
                                             name_prefix='model')
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                policy_kwargs=policy_kwargs,
                **sac_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="sac",
                callback=checkpoint_callback)
    return
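A minimal way to invoke train_policy is sketched below; every hyperparameter value is an illustrative assumption rather than a setting taken from the original project.

# Hypothetical invocation of train_policy; all values below are assumptions.
train_policy(num_of_envs=1,
             log_relative_path='logs/sac_picking',
             maximum_episode_length=600,
             skip_frame=3,
             seed_num=0,
             sac_config=dict(gamma=0.98, learning_rate=3e-4, buffer_size=1000000),
             total_time_steps=1000000,
             validate_every_timesteps=500000,
             task_name='picking')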
Example #2
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  #action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
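This function relies on module-level globals (policy, n_timesteps, callback) that are not shown on this page. A minimal sketch of plausible definitions, following the "save on best mean reward" callback pattern from the Stable Baselines documentation (treat the exact values and paths as assumptions; the values mirror Example #17), is:

import os
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

policy = 'MlpPolicy'      # assumption, mirrors Example #17
n_timesteps = int(1e6)    # assumption, mirrors Example #17
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    # Every 1000 calls, check the Monitor logs and save the model if the
    # rolling mean reward improved (output_dir is set globally by train_SAC).
    global n_steps, best_mean_reward
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(os.path.join(output_dir, 'log')), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(os.path.join(output_dir, 'best_model.pkl'))
    n_steps += 1
    return True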
Example #3
def test_predict_SAC():
    '''
    Visualize predictions from a randomly initialised (untrained) SAC policy.
    '''
    env = gym.make('KukaMujocoSAC-v0')
    model = SAC(SAC_MlpPolicy, env)
    obs = env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, rew, done, info = env.step(action, render=True)
        if done:
            obs = env.reset()  # start a new episode once the current one ends
Example #4
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None):
    env = gym.make(env_id)

    model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "sac", env_id, policy, seed)
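save_model_weights is a project helper that is not shown on this page; a minimal sketch of what it might do (the directory layout and file name are assumptions) could be:

def save_model_weights(model, algo, env_id, policy, seed):
    # Hypothetical helper: persist the trained model under a descriptive name.
    model.save("weights/{}_{}_{}_seed{}".format(algo, env_id, policy, seed))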
Example #5
def main(logdir):
    # params
    SLEEP_RATE = 100  #100Hz
    N_EPISODE = 1000
    EPISODE_TIME = 30
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    # NOTE: the logdir argument is overridden by a hard-coded path here
    logdir = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2'
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)
    # callback
    SAVE_FREQ = EPISODE_LENGTH * 20  # every 20 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # training got killed for some reason, so continue from the checkpoint
    model_path = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2/best_model.zip'
    model = SAC.load(model_path)
    model.set_env(env)

    print("---------- Start Learning -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)

    print("---------- Finish Learning ----------")
    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
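SaveOnBestTrainingRewardCallback is referenced above but not defined on this page. A sketch that follows the callback example from the Stable Baselines documentation (and would produce the best_model.zip file loaded above) is shown below; treat the details as assumptions.

import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    # Every `check_freq` steps, read the Monitor logs and save `best_model.zip`
    # whenever the rolling mean reward improves.
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True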
Example #6
def main():
    env = gym.make("teaching-env-v0",
                   teacher_path=os.path.join(os.getcwd(), "../saved_models",
                                             sys.argv[1]),
                   validation_path=DATA_PATH,
                   max_queries=config.MAX_QUERIES)
    agent_model = SAC(MlpPolicy,
                      env,
                      train_freq=1,
                      batch_size=64,
                      learning_rate=3e-4,
                      learning_starts=0,
                      buffer_size=1000,
                      random_exploration=config.EPSILON_EXPLORATION,
                      gamma=config.GAMMA,
                      verbose=1)
    #agent_model.learn(total_timesteps=config.MAX_QUERIES * config.NUM_TRAIN_EPISODES)
    #agent_model.save('test_SAC')

    # SAC.load is a classmethod that returns a new model, so keep its result
    agent_model = SAC.load('test_SAC', env=env)

    obs = env.reset()

    total_reward = float('-inf')
    prog = tqdm(range(config.MAX_QUERIES), postfix={'Reward': total_reward})

    actions = []  # For visualization
    total_reward = 0.0
    for i in prog:
        action = select_action(agent_model,
                               obs,
                               epsilon=config.EPSILON_EXPLORATION)
        #action, _states = agent_model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        prog.set_postfix({'Reward': total_reward})
        actions.append(action.item())  # np.asscalar is deprecated; .item() is equivalent
    plt.hist(actions, bins=config.NUM_BINS, range=(-5, 5), density=True)
    plt.savefig('./visualizations/histograms/SAC')
    plt.clf()

    # Plot student's predicted function
    inputs = np.linspace(-5, 5, num=1000)
    outputs = env.student_model(inputs.reshape(-1, 1))
    plt.scatter(inputs, outputs, s=0.1, label='SAC')
    plt.title("SAC Student's Approximation")
    plt.ylim((-60, 100))
    plt.savefig('./visualizations/functions/SAC')
    plt.clf()
Example #7
def rollout_policy(filename, traj_len, seed, env_name, n_trajs=1):
    model = SAC.load(filename)
    env = gym.make(env_name)
    env.seed(seed)

    trajs = []
    for _ in range(int(n_trajs)):
        obs_list, acts_list, rews_list = [], [], []
        obs = env.reset()
        obs_list.append(obs)
        for _ in range(traj_len):
            act = model.predict(obs, deterministic=True)[0]
            obs, r, done, _ = env.step(act)
            # assert not done
            acts_list.append(act)
            obs_list.append(obs)
            rews_list.append(r)

        infos = [{} for _ in range(traj_len)]
        traj = types.TrajectoryWithRew(
            obs=np.array(obs_list),
            acts=np.array(acts_list),
            infos=infos,
            rews=np.array(rews_list),
        )
        trajs.append(traj)

    return trajs
Example #8
def get_new_weights():

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'],
                         settings['strategy_name'], settings['total_steps'],
                         settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'], settings['debug'])
    #   Create the vectorized environment
    #   v_env = DummyVecEnv([lambda: v_env])
    #   Normalize environment
    #   v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'], clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'], gamma=p_gamma, epsilon=EPS)

    model = SAC.load(MODELS_DIR + settings['model_name'])

    # Strategy

    obs = v_env.reset()
    dones = False

    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = v_env.step(action)
        #   v_env.render(mode='ansi')

    weights = v_env.current_weights

    return weights
Example #9
def load_model(path: str, env, desc: str):
    """ Loads a model from a Stable Baselines checkpoint file into memory

    Args:
        path        (str)           :       Path to the Stable Baselines checkpoint file
        env         (SB Env)        :       Environment to attach to the loaded model
        desc        (str)           :       Text description of which algorithm this is

    Returns:
        The loaded model
    """

    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
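A hypothetical call to load_model (the checkpoint path and environment id are placeholders):

import gym

env = gym.make('Pendulum-v0')          # placeholder environment
model = load_model('checkpoints/sac_final.zip', env, desc='sac')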
Example #10
def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]

    for s in seeds:
        # Load Models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]

        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]

        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)

            for i in range(50):
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)
Example #11
def example():
    # This tutorial shows how to view policies of trained actors

    task = generate_task(task_generator_id='picking')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 0
    stable_baselines_policy_path = "./model_2000000_steps.zip"
    model = SAC.load(stable_baselines_policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # Recording a video of the policy is done in one line
    viewer.record_video_of_policy(task=task,
                                  world_params=world_params,
                                  policy_fn=policy_fn,
                                  file_name="pushing_video",
                                  number_of_resets=10,
                                  max_time_steps=10 * 100)

    # Similarly for interactive visualization in the GUI
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40)
Example #12
def evaluate_policy(policy_file, policy_type, envname, num_rollouts):
    if policy_type == "ppo":
        model = PPO2.load(policy_file)

        def get_action(obs):
            return model.predict(obs)[0]

    elif policy_type == "sac":
        model = SAC.load(policy_file)

        def get_action(obs):
            return model.predict(obs, deterministic=True)[0]

    else:
        raise NotImplementedError()

    env = gym.make(envname)

    returns = []
    for i in range(num_rollouts):
        # print("iter", i, end=" ")
        obs = env.reset()
        done = False
        totalr = 0.0
        while not done:
            action = get_action(obs)
            obs, r, done, _ = env.step(action)
            totalr += r
        returns.append(totalr)

    return np.mean(returns), np.std(returns)
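Hypothetical usage of evaluate_policy (the policy file and environment id are placeholders):

mean_ret, std_ret = evaluate_policy('sac_final.zip', 'sac', 'Pendulum-v0', num_rollouts=10)
print("mean return: {:.2f} +/- {:.2f}".format(mean_ret, std_ret))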
Example #13
def test_SAC(env, out_dir, seed=None, **kwargs):
    model = SAC.load(os.path.join(out_dir, 'final_model'), env=env)
    env.seed(seed)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, out_dir, num_episodes=20)

    return
Example #14
def play():
    # np.format_float_scientific returns a string, so use %s, not %d
    model = SAC.load(expDir + "/%s/%s" %
                     (name, np.format_float_scientific(nIter)))
    env = gym.make('PointMassDense-1-v1')
    obs = env.reset()  # obs must be initialised before the first predict call
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='human')
        if dones:
            obs = env.reset()
Example #15
def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])

    if (isinstance(training_tag, float)):
        model = SAC(sac_MlpPolicy,
                    env,
                    ent_coef=training_tag,
                    verbose=1,
                    policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            env.reset()

            # learn() returns the model itself, so no tuple unpacking is needed
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            file_tag = str(training_tag).replace(".", "p")
            if (SAVE_AGENTS):
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_s" +
                           str(step) + "_t" + str(file_tag) + "_i" +
                           str(CURRENT_ITERATION) + "_ts" +
                           str(TRAINING_TIMESTEPS))

        if (SAVE_FINAL_AGENT):
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_t" +
                       str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" +
                       str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return
Example #16
    def __init__(self, env):

        self.env = env

        load_path_rl = "/home/icv/Trustworth/TiRL/models/sac-5"
        log_path_rl = "/home/icv/Trustworth/TiRL/data/sac-5"

        self.model_rl = SAC.load(load_path_rl, env=env, tensorboard_log=log_path_rl)

        load_path_rule = "/home/icv/Trustworth/TiRL/models/sac_rule3"
        log_path_rule = "/home/icv/Trustworth/TiRL/data/sac_rule3"

        self.model_rule = SAC.load(load_path_rule, env=env, tensorboard_log=log_path_rule)
        self.agent_rule = IDM(env)

        print("load model successfully")

        self.reset()
Example #17
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None
    # Adaptive parameter noise is set up here but not actually used by SAC below
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Example #18
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    env = gym.make(env_id)

    if load_weights is not None:
        model = SAC.load(load_weights, env, verbose=0)
    else:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example #19
def play(save_dir, env):
    model = SAC.load(save_dir + '/model_dir/sac/test_25_25_14_15',
                     env=env,
                     custom_objects=dict(learning_starts=0))  ### ADD NUM
    for _ in range(2):

        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
Example #20
    def f_checkpoints_range_2_mean_performance(
            self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
        logging.debug(
            f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
        )
        rewards = np.zeros(len(checkpoints))
        s_rates = np.zeros(len(checkpoints))
        # Intent
        # - Iterate over this range, to load the associated Stable Baseline Model Checkpoint
        # - Pass that model to `mean_eval` evaluation function which will evaluate the model on
        #   - a certain number of episodes
        #   - a certain env
        #    - continuous or not continuous space
        # - an evaluation returns reward and average success rate
        #
        # Evaluate N checkpoints on M episodes each, then average over M to obtain
        # N rewards and N success rates.

        j = 0
        # NOTE: `i` can range over arbitrary checkpoint ids while `j` indexes the numpy arrays
        for i in checkpoints:
            path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
            logging.debug(f"Evaluating model at {path}")
            if self.args.model['name'] == "ddpg":
                model = DDPG.load(path)
            elif self.args.model['name'] == "ppo":
                model = PPO2.load(path)
            elif self.args.model['name'] == "trpo":
                model = TRPO.load(path)
            elif self.args.model['name'] == "td3":
                model = TD3.load(path)
            elif self.args.model['name'] == "sac":
                model = SAC.load(path)
            else:
                raise ValueError(f"Unsupported model name: {self.args.model['name']}")
            logging.debug(
                f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
            )
            rewards_list, success_rates_list = mean_eval(
                num_episodes=self.args.n_episodes,
                checkpoint_id=i,
                model=model,
                env=self.env,
                v=True,
                continuous=self.args.continuous,
                plots_dir=self.args.plots_dir)
            rewards_mean = np.mean(rewards_list)
            success_rates_mean = np.mean(success_rates_list)
            logging.debug(
                f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
            )
            rewards[j] = rewards_mean
            s_rates[j] = success_rates_mean
            j += 1
        return rewards, s_rates
Example #21
def load_model(config):
    model = None
    # NOTE: `args` is assumed to be a module-level dict holding the CLI arguments
    if config["algo_name"] == "TD3":
        model = TD3.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "A2C":
        model = A2C.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "SAC":
        model = SAC.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "PPO2":
        model = PPO2.load("agents/{}".format(args["test_agent_path"]))
    assert model is not None, "Alg name not found, cannot load model, exiting. "
    return model
Example #22
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7

    save_video_length = 200
    save_video_interval = 1000000
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name, np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
Example #23
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " saved")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Example #24
def main(argv):
    fixed = True

    policy_name = "sac_reaching_policy"

    obj_pose_rnd_std = 0 if fixed else 0.05
    pandaenv = pandaReachGymEnv(renders=True,
                                use_IK=0,
                                numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]

    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy,
                pandaenv,
                gamma=0.9,
                batch_size=16,
                verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")

    model.learn(total_timesteps=1000000)

    model.save("../pybullet_logs/pandareach_sac/" + policy_name)

    del model  # remove to demonstrate saving and loading
Example #25
def run_experiment(verbose, tensorboard_log, learning_rate):
    # pdb.set_trace()  # debugging breakpoint; keep commented out for unattended runs
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Example #26
def explore(app,
            emulator,
            appium,
            timesteps,
            timer,
            save_policy,
            policy_dir,
            cycle,
            train_freq=5,
            target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy,
                    env,
                    verbose=1,
                    train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
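TimerCallback is a project-specific callback that is not shown here. One plausible sketch, assuming `timer` is a wall-clock budget in seconds and that the project uses the Stable Baselines BaseCallback API, is:

import time
from stable_baselines.common.callbacks import BaseCallback

class TimerCallback(BaseCallback):
    # Hypothetical callback: stop training once the wall-clock budget has elapsed.
    def __init__(self, timer, app, verbose=0):
        super(TimerCallback, self).__init__(verbose)
        self.timer = timer
        self.app = app
        self.start_time = None

    def _on_training_start(self):
        self.start_time = time.time()

    def _on_step(self):
        # Returning False stops model.learn() early.
        if time.time() - self.start_time > self.timer:
            return False  # budget exhausted, stop training
        return True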
Example #27
def load_model(model_path, params):
    env_cls = globals()[params['env']]
    orig_env = env_cls(**params['env_options'])
    env = DummyVecEnv([lambda: orig_env])

    if params['alg'] == 'PPO2':
        model = PPO2.load(model_path, env=env)
    elif params['alg'] == 'SAC':
        model = SAC.load(model_path, env=env)
    else:
        raise NotImplementedError

    return orig_env, model
Example #28
def sac(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.1) *
                                                np.ones(n_actions))

    return SAC('MlpPolicy',
               env,
               learning_rate=0.001,
               action_noise=action_noise,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
Example #29
def get_SAC_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = SAC.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = SAC(SACMlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)
    return model, env
Example #30
    def load_model(self):
        model_path = "data/saved_models/"
        if self.folder:
            model_path = model_path + self.folder + "/"
        else:
            model_path = model_path + self.model_name + "/"

        model_path = model_path + self.model_name

        if self.episode:
            model_path = model_path + "_" + self.episode + ".pkl"

        self.model = SAC.load(model_path)