Code example #1
def explore(app,
            emulator,
            appium,
            timesteps,
            timer,
            save_policy,
            policy_dir,
            cycle,
            train_freq=5,
            target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy,
                    env,
                    verbose=1,
                    train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
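
Note: TimeFeatureWrapper and TimerCallback in the example above are project-specific helpers that are not shown in this excerpt. Purely as an illustration (assuming stable-baselines >= 2.10), a minimal time-budget callback with the same constructor signature could look like the sketch below; the real implementation may differ.

import time
from stable_baselines.common.callbacks import BaseCallback

class TimerCallback(BaseCallback):
    """Illustrative sketch: stop training once a wall-clock budget (in seconds) runs out."""

    def __init__(self, timer, app=None, verbose=0):
        super(TimerCallback, self).__init__(verbose)
        self.timer = timer
        self.app = app  # kept only to mirror the constructor call used above
        self.start_time = None

    def _on_training_start(self):
        self.start_time = time.time()

    def _on_step(self):
        # Returning False makes model.learn() stop early.
        return (time.time() - self.start_time) < self.timer
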
Code example #2
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(save_freq=int(
        validate_every_timesteps / num_of_envs),
                                             save_path=log_relative_path,
                                             name_prefix='model')
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                policy_kwargs=policy_kwargs,
                **sac_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="sac",
                callback=checkpoint_callback)
    return
Code example #3
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  #action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Code example #4
File: sac_func.py Project: shivanikishnani/codelab
def run_experiment(verbose, tensorboard_log, learning_rate):
    pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Code example #5
def main(argv):
    fixed = True

    policy_name = "sac_reaching_policy"

    obj_pose_rnd_std = 0 if fixed == True else 0.05
    pandaenv = pandaReachGymEnv(renders=True,
                                use_IK=0,
                                numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]

    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy,
                pandaenv,
                gamma=0.9,
                batch_size=16,
                verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")

    model.learn(total_timesteps=1000000)

    model.save("../pybullet_logs/pandareach_sac/" + policy_name)

    del model  # remove to demonstrate saving and loading
Code example #6
File: model.py Project: Extracheesy/DailyTradingAC
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish     :")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Code example #7
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7

    save_video_length = 200
    save_video_interval = 1000000
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name, np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
Code example #8
def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = SAC(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32, \
                target_update_interval=32, tensorboard_log=dir_path+'/Logs/')
    model.learn(total_timesteps=2000, log_interval=1000000)
    model.save("Grasp_Model_Full_Pose")
Code example #9
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)
    ############################
    #          Logging         #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt',
              'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_,
                                           n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq,
                                           log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #            run           #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps),
                log_interval=20,
                callback=callback)
    model.save(os.path.join(path, "SAC_Walker2d"))
Code example #10
def train(learning_rate, time_steps, env, model_path):
    
    tf.reset_default_graph()    # reset the default graph to avoid conflicts with existing parameters (not recommended if parameters must be reused)


    # default policy is MlpPolicy
    model = SAC(CustomSACPolicy, env, verbose=1, seed=10, n_cpu_tf_sess=16)
    model.learn(total_timesteps=int(time_steps), log_interval=1000, callback=callback)
    model.save(model_path)
Code example #11
def train_SAC(env_train, model_name, timesteps=50000):
    start = time.time()
    model = SAC('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
Code example #12
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None):
    env = gym.make(env_id)

    model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "sac", env_id, policy, seed)
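
Note: save_model_weights is defined elsewhere in that project and is not shown here. Assuming it only composes a file name from its arguments and delegates to model.save, a hypothetical minimal version might look like this:

import os

def save_model_weights(model, algo_name, env_id, policy, seed, out_dir="weights"):
    # Hypothetical helper: saves e.g. weights/sac_Pendulum-v0_MlpPolicy_seed0
    os.makedirs(out_dir, exist_ok=True)
    fname = "%s_%s_%s_seed%s" % (algo_name, env_id, policy, seed)
    model.save(os.path.join(out_dir, fname))
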
Code example #13
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8

    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  #, 5e-4, 1e-5
        logger = osp.join(
            expDir, name, 'logs%s_%s' % (np.format_float_scientific(nIter),
                                         np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
Code example #14
File: train.py Project: xc-jp/InsertionAI
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('--host',
                        default="127.0.0.1",
                        type=str,
                        help='IP of the server')
    parser.add_argument(
        '--port',
        default=9081,
        type=int,
        help='Port that should be used to connect to the server')
    parser.add_argument('--save',
                        action="store_true",
                        help=('Saves checkpoints'))
    parser.add_argument(
        '--use_coord',
        action="store_true",
        help=('If set, the environment\'s observation space will be '
              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    env = gym.make('insertion-v0',
                   kwargs={
                       'host': args.host,
                       "port": args.port,
                       "use_coord": args.use_coord
                   })
    # check_env(env, warn=True)

    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")
    # print(env.action_space.sample())

    # Save a checkpoint every 50000 steps
    ckpt = CkptCallback(save_freq=50000,
                        save_path='../checkpoints/',
                        name_prefix='rl_insertion') if args.save else None

    if args.use_coord:
        model = SAC('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")

    model.learn(50001, callback=ckpt)
Code example #15
def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])

    if (isinstance(training_tag, float)):
        model = SAC(sac_MlpPolicy,
                    env,
                    ent_coef=training_tag,
                    verbose=1,
                    policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            env.reset()

            (model, learning_results) = model.learn(
                total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            file_tag = str(training_tag).replace(".", "p")
            if (SAVE_AGENTS):
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_s" +
                           str(step) + "_t" + str(file_tag) + "_i" +
                           str(CURRENT_ITERATION) + "_ts" +
                           str(TRAINING_TIMESTEPS))

        if (SAVE_FINAL_AGENT):
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_t" +
                       str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" +
                       str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return data
Code example #16
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    #from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()
    # generate expert trajectories
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100, n_episodes=10)

    # Load dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)

    model.learn(total_timesteps=1000)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
Code example #17
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None
    # Add some param noise for exploration
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Code example #18
File: models.py Project: ioneliabuzatu/rl-benchmarks
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    env = gym.make(env_id)

    if load_weights is not None:
        model = SAC.load(load_weights, env, verbose=0)
    else:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Code example #19
File: model.py Project: Extracheesy/DailyTradingAC
def train_SAC(env_train, model_name, timesteps=100000):

    # train SAC model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train SAC Model with MlpPolicy: ")

    model = SAC('MlpPolicy', env_train, verbose=0)
    print("SAC Learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("SAC Model learning completed: ")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("SAC Model save finish     :")
    print('Training time SAC: ', (end - start) / 60, ' minutes')
    os.chdir("./..")

    return model
Code example #20
    def train_SAC(self, model_name, model_params=config.SAC_PARAMS):
        """TD3 model"""
        from stable_baselines import SAC

        env_train = self.env

        start = time.time()
        model = SAC(
            'MlpPolicy',
            env_train,
            batch_size=model_params['batch_size'],
            buffer_size=model_params['buffer_size'],
            learning_rate=model_params['learning_rate'],
            learning_starts=model_params['learning_starts'],
            ent_coef=model_params['ent_coef'],
            verbose=model_params['verbose'],
            tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
        model.learn(total_timesteps=model_params['timesteps'],
                    tb_log_name="SAC_run")
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (SAC): ', (end - start) / 60, ' minutes')
        return model
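
Note: model_params is read from config.SAC_PARAMS. Judging from the keys accessed above, that dictionary has roughly the following shape; the values shown here are generic illustrations, not the project's actual settings.

SAC_PARAMS = {
    "batch_size": 64,
    "buffer_size": 100000,
    "learning_rate": 3e-4,
    "learning_starts": 100,
    "ent_coef": "auto",
    "verbose": 0,
    "timesteps": 50000,
}
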
Code example #21
File: agent.py Project: TylerJamesMalloy/bullet3
def test_agent(agent_step):
    now = time.time()
    for coef_index in range(len(CLAC_COEFS)):

        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]
        training_timestep = 0

        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])

        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])

        mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-3, verbose=1)
        
        for resample_step in range(0, NUM_RESAMPLES):
            features = pd.DataFrame()

            if(agent_step == 1):
                print(mut_coef,  "  ",  ent_coef, "  ", NUM_TRAINING_STEPS, "  ",  ENVIRONMENT_NAME, "  ", FOLDER, " ", resample_step)

            (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)

            # Save models 
            clac_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))

            training_timestep += NUM_TRAINING_STEPS

            # Test Normal 
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            # Test generalization 
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            # Test generalization Extreme
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            clac_env.env_method("reset_features")
            sac_env.env_method("reset_features")
            mirl_env.env_method("reset_features")
        
        del sac_model
        del sac_env

        del clac_model
        del clac_env
        
        del mirl_model
        del mirl_env

    later = time.time()
    difference = int(later - now)
    print("Tested Agent Time: ", difference)
Code example #22
import gym
import rlbench.gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC
import os
dir_path = os.path.dirname(os.path.realpath(__file__))

env = gym.make("empty_container-state-v0",render_mode="human",observation_mode='vision')
model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=dir_path+'/Logs/')
model.learn(total_timesteps=1000)
model.save("sac_ec")
Code example #23
import gym
import numpy as np
import imageio

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)

model.save("../models/sac_pendulum")

del model  # remove to demonstrate saving and loading

model = SAC.load("../models/sac_pendulum")

#obs = env.reset()
#while True:
#    action, _states = model.predict(obs)
#    obs, rewards, dones, info = env.step(action)
#    env.render()
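
Note: the commented-out lines above are the usual evaluation loop. Made runnable with a fixed step cap (a sketch; Pendulum-v0 episodes are time-limited, so a few hundred steps are enough to see the loaded policy):

obs = env.reset()
for _ in range(500):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()
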
Code example #24
    step += 1
    if (dones):
        obs = env.reset()
        break
    #print(rewards)
    #env.render()

print("Initialization evaluation :{}, steps :{}".format(cum_reward, step))

## Evaluation complete of intialization

print("Starting Experiment with seed: {}".format(seed))

#model = PPO2(MlpPolicy, env,verbose=True)
model.learn(total_timesteps=1000000,
            use_action_repeat=False,
            poisson=False,
            callback=callback,
            only_explore_with_act_rep=False)
# f.close()
# json = json.dumps(log_data)
# f = open(log_dir+"log_data.json","w")
# f.write(json)
# f.close()
np.save(log_dir + "log_data.npy", log_data)

# Don't forget to save the VecNormalize statistics when saving the agent
# log_dir = "logs/hopper_aneal/"
# model.save(log_dir + "sac_hopper")
#env.save(os.path.join(log_dir, "vec_normalize.pkl"))
Code example #25
class DriveAgent:
    """
    Python 3. The rest of the files are in Python 2.
    """
    def __init__(self):
        logger.info(os.getcwd())
        #self._interpreter = Interpreter("./converted_model.tflite")
        #self._interpreter.allocate_tensors()
        #print(self._interpreter.get_input_details())
        #print(self._interpreter.get_output_details())
        #_, self._input_height, self._input_width, _ = self._interpreter.get_input_details()[0]['shape']

        self.env = AutoDriftEnv(const_throttle=0.3)
        # self.model = SacModel(policy=CnnPolicy, env=self.env)
        self.model = SAC(policy=CnnPolicy, env=self.env)

        # self._input_height = IMAGE_HEIGHT
        # self._input_width = IMAGE_WIDTH
        # print(self._input_height)
        # print(self._input_width)

        # self._socket = socket.socket()
        # socket_addr = ('127.0.0.1', 8888)
        # UNCOMMENT THIS
        #self._socket.connect(socket_addr)

        self.main()

    def main(self):
        try:
            # Save a checkpoint every 1000 steps
            # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/callbacks.py
            checkpoint_callback = CheckpointCallback(save_freq=1000,
                                                     save_path='./logs',
                                                     name_prefix='rl_model',
                                                     verbose=2)
            self.model.learn(total_timesteps=5000,
                             log_interval=4,
                             callback=checkpoint_callback)

        # input_details = self._interpreter.get_input_details()
        # output_details = self._interpreter.get_output_details()
        # with picamera.PiCamera(resolution=(640, 480), framerate=30) as camera:
        #     # camera.vflip = True
        #     # camera.start_preview()
        #     try:
        #         stream = io.BytesIO()
        #         for _ in camera.capture_continuous(stream, format='jpeg', use_video_port=True):
        #             stream.seek(0)
        #             image = Image.open(stream).convert('RGB').resize((self._input_width, self._input_height), Image.ANTIALIAS)
        #             start_time = time.time()
        #
        #             img = np.asarray(image)
        #             img = img[np.newaxis, ...] # what's this for?
        #             input_data = np.array(img, dtype=np.float32)
        #             print(input_data.shape)
        #             print(input_data)
        #             #self._interpreter.set_tensor(input_details[0]['index'], input_data)
        #
        #             #self._interpreter.invoke()
        #             #output_data = self._interpreter.get_tensor(output_details[0]['index'])[0]
        #             # TEMP FIX
        #             output_data = None
        #             # time_taken_ms = (time.time() - start_time) * 1000
        #             # print(f'output_data:{output_data}, time_taken:{time_taken_ms}ms')
        #             # camera.annotate_text = str(output_data) + ", " + str(time_taken_ms)
        #             stream.seek(0)
        #             stream.truncate()
        #
        #             data = []
        #             data.append(output_data)
        #             data_string = pickle.dumps(data, protocol=1)
        #             self._socket.send(data_string)
        #
        except KeyboardInterrupt:
            print("DriveAgent: Ctrl-C")
        finally:
            # camera.stop_preview()
            self.env.close()
            print("DriveAgent: environment closed, done")
Code example #26
import gym_reacher

from stable_baselines import SAC
from stable_baselines.common.callbacks import CheckpointCallback

# Save a checkpoint every 1000 steps

checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='../results/tests/logs/',
                                         name_prefix='rl_model')

model = SAC('MlpPolicy', 'Reacher3Dof-v0')
model.learn(2000, callback=checkpoint_callback)
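
Note: CheckpointCallback writes files named <name_prefix>_<num_timesteps>_steps.zip under save_path, so a checkpoint produced by the run above can be reloaded roughly as follows (the exact step count in the file name depends on when the callback fired):

from stable_baselines import SAC

model = SAC.load('../results/tests/logs/rl_model_1000_steps')
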
Code example #27
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE>0 : env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__  == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[400, 300])
        model = TD3(CustomPolicy2, env,
                    verbose = 1,
                    tensorboard_log = 'data/TBlogs/initial_policy_training',
                    batch_size = args['batch_size'],
                    buffer_size = args['buffer_size'],
                    gamma = args['gamma'],
                    gradient_steps = args['gradient_steps'],
                    learning_rate = args['learning_rate'],
                    learning_starts = args['learning_starts'],
                    action_noise = action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                     seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
        # Note: CustomPolicy is defined only inside the SAC branch above, so this
        # fallback raises a NameError unless that class is moved to module scope.
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
Code example #28
# env = VecNormalize(env, norm_obs=True, norm_reward=False,
#                            clip_obs=10.)

env = Monitor(env.envs[0], log_dir, allow_early_resets=True)

#env.act_rep = 20

# print(env.observation_space)

low = np.full((12, ), -float('inf'))
high = np.full((12, ), float('inf'))
space = spaces.Box(low, high, dtype=low.dtype)
env.observation_space = space
print(env.observation_space)
# exit()

model = SAC(MlpPolicy, env, verbose=1)
#model = PPO2(MlpPolicy, env,verbose=True)
model.learn(total_timesteps=1000000, poisson=True, callback=callback)
f.close()
# json = json.dumps(log_data)
# f = open(log_dir+"log_data.json","w")
# f.write(json)
# f.close()
np.save(log_dir + "log_data.npy", log_data)

# Don't forget to save the VecNormalize statistics when saving the agent
# log_dir = "logs/hopper_aneal/"
# model.save(log_dir + "sac_hopper")
#env.save(os.path.join(log_dir, "vec_normalize.pkl"))
Code example #29
                 verbose=1,
                 tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "SAC":
    from stable_baselines.sac.policies import MlpPolicy
    from stable_baselines import SAC
    env = gym.make(env_name)
    model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "DDPG":
    if train:
        for i in range(model_num):
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
            from stable_baselines import DDPG
            env = gym.make(env_name)

            # the noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
Code example #30
def callback(_locals, _globals):
    global best_mean_reward, n_steps
    mean_reward = float("-inf")
    if n_steps % 1000 == 0 and n_steps != 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
    if mean_reward > best_mean_reward:
        best_mean_reward = mean_reward
        print("Saving new best model")
        _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True

log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(env, log_dir, allow_early_resets=True)
time_steps=1e6
model = SAC(MlpPolicy, env, verbose=0)
model.learn(total_timesteps=int(time_steps), callback=callback)
#results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "Hopper")
x, y = ts2xy(load_results(log_dir), "timesteps")
#results_plotter.plot_results([log_dir], time_steps, y, "Hopper")
plt.plot(x,y)
#plt.show()
plt.savefig("hopper_sac_default")