Example #1
    log_path = "{}/{}/".format(args.log_folder, args.algo)
    save_path = os.path.join(
        log_path, "{}_{}{}".format(env_id,
                                   get_latest_run_id(log_path, env_id) + 1,
                                   uuid_str))
    params_path = "{}/{}".format(save_path, env_id)
    os.makedirs(params_path, exist_ok=True)

    callbacks = []
    if args.save_freq > 0:
        # Account for the number of parallel environments
        args.save_freq = max(args.save_freq // n_envs, 1)
        callbacks.append(
            CheckpointCallback(save_freq=args.save_freq,
                               save_path=save_path,
                               name_prefix='rl_model',
                               verbose=1))

    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether is it an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        :return: (gym.Env)
        """
        global hyperparams

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path
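The save_freq adjustment above is needed because stable-baselines callbacks are called once per vectorized step, i.e. once per n_envs environment transitions. A minimal, self-contained sketch of the same checkpointing pattern, assuming stable-baselines 2.x and the standard CartPole-v1 environment (neither appears in the snippet above):

import gym
from stable_baselines import PPO2
from stable_baselines.common import make_vec_env
from stable_baselines.common.callbacks import CheckpointCallback

n_envs = 4
save_freq = 10000  # desired frequency in single-environment steps
save_freq = max(save_freq // n_envs, 1)  # the callback counts vectorized steps

env = make_vec_env('CartPole-v1', n_envs=n_envs)
checkpoint_callback = CheckpointCallback(save_freq=save_freq,
                                         save_path='./logs/',
                                         name_prefix='rl_model',
                                         verbose=1)
model = PPO2('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=50000, callback=checkpoint_callback)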
Example #2
e_c = 0.0001  #define entropy coeff
feedback = 'Bayes'  #'Markov' or 'Bayes'
steady = True  #if True resets always with steady state conditions
N = 8  #number of parallel workers
LRo = 2.5e-4  #learning rate
#uact=True #if we want to use u as action (only Bayesian)
TIMESTEPS = int(50e6)  #training steps
sched_LR = LinearSchedule(1, LRo, 0)  #lr schedule
LR = sched_LR.value
qs = 0  #no feedback cost
dirname = 'Fisher_tests_{}RK4_cirand'.format(feedback)  #directory name
title = 'feed{}_steady{}_lro{}_ts{}M_N{}_ec{}_0.49_3e4_theta0.1_Mlp_1e-3_RK4_bothrand_s2'.format(
    feedback, steady, LRo, TIMESTEPS / 1e6, N, e_c)
#make checkpoint callback
checkpoint_callback = CheckpointCallback(
    save_freq=int(100000 / N),
    save_path='./Fisher_nocost_checkpoint/{}/{}_q{}'.format(
        dirname, title, qs))
callback = checkpoint_callback
#set parameters and start training
params = {
    'k': 1,
    'eta': 1,
    'X_kunit': 0.49,
    'theta': 0.1
}  #if a parameter is set to None it will be sampled from a uniform distribution at every reset
args = {
    'feedback': feedback,
    'params': params
}  #the default parameters are: rewfunc=Tools.purity_like_rew, q=1e-4, dt=1e-3, plot=False, pow=0.5
#instantiate environment
env = make_vec_env(FisherEnv, n_envs=N, env_kwargs=args)
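Note on the learning-rate schedule above: LinearSchedule(1, LRo, 0) maps an input t in [0, 1] to t * LRo, and sched_LR.value is the callable that gets passed to the algorithm (the model construction is not shown in this snippet). If it is used as a callable learning rate with an algorithm such as PPO2, stable-baselines 2.x calls it with the remaining-progress fraction (decreasing from 1 to 0), so the learning rate anneals linearly from LRo down to 0. A rough equivalent, for illustration only:

def lr_schedule(progress_remaining, lr_initial=LRo):
    # progress_remaining goes from 1 (start of training) to 0 (end)
    return progress_remaining * lr_initial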
Example #3
steady = True  #if True resets always with steady state conditions
plot = False  #if True resets always to fixed out of equilibrium conditions
N = 1  #number of parallel workers
LRo = 2e-4  #define the learning rate
TIMESTEPS = int(6e6)  #training steps
sched_LR = LinearSchedule(1, LRo, 0)  #schedule for lr reduction
LR = sched_LR.value
clip = LinearSchedule(1, 0.2,
                      0).value  #schedule for the PPO clipping parameter (optional)

title = 'feed{}_steady{}_lro{}_ts{}M_N{}_ec{}_{}_{}_{}_partial{}_fbound{}_tanh0.01_pur0.5_hurwseedr0_1e5'.format(
    feedback, steady, LRo, TIMESTEPS / 1e6, N, e_c, k, mirr, g, partial,
    fbound)
#make checkpoint callback
checkpoint_callback = CheckpointCallback(
    save_freq=int(1000000 / N),
    save_path='/home/fallani/prova/New/Optomech_checkpoint/{}/{}_q{}'.format(
        dirname, title, qs))
callback = checkpoint_callback
#set F matrix
zero = np.zeros((2, 2))
if fbound:
    F = np.block([[zero, zero], [zero, np.identity(2)]])  #custom F matrix
else:
    F = np.identity(4)
P = np.block([[np.identity(2), zero], [zero, zero]])
#set parameters and start training
params = par.parameters(k=k, mirr=mirr, g=g)
#commented parameters from Hammerer, usually useless
#params={'wm':1,'k':0.5,'y':2e-7,'eta':1,'g':0.3,'detuning':-1,'ne':3.5e5,'na':0,'phi':math.pi/2}#{'wm':1,'k':5,'y':1.14e-4,'eta':1,'g':0.095,'detuning':0,'ne':2,'na':0,'phi':math.pi*0.25} #if a parameter is set to None it will be sampled from a uniform distribution at every reset
args = {
    'feedback': feedback,
Example #4
import warnings

import numpy as np

from stable_baselines import PPO2
from stable_baselines.common import make_vec_env
from stable_baselines.common.callbacks import (CallbackList, CheckpointCallback,
                                               EvalCallback,
                                               StopTrainingOnRewardThreshold)
# RPiLEDEnv (the custom gym environment used below) is assumed to be imported from the author's own module

warnings.filterwarnings('ignore')

envArgsDict = {
    'resizeCamImagePct': 50,
    'ledHSVLower': np.array([0, 0, 252]),
    'ledHSVHigher': np.array([31, 9, 255]),
    'rPiIP': '192.168.0.183',
    'rPiPort': 50000,
    'episodeLength': 100,
    'bullseye': 10
}

env = make_vec_env(RPiLEDEnv, n_envs=1, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-500, verbose=1)

eval_callback = EvalCallback(env, best_model_save_path='./logs/best',
                             log_path='./logs/', eval_freq=500,
                             deterministic=True, render=False, callback_on_new_best=callback_on_best)

# Added checkpoint because I lost model data after a crash when the webcam shutdown because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
                                         name_prefix='ppo2_model')

cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers':[128, 128]}

model = PPO2.load('/Users/guillaumevandecasteele/PycharmProjects/robotics/ppo1_rpi_led_nn128.zip', verbose=1, policy_kwargs=policy_kwargs, tensorboard_log='./logs/')
model.set_env(env)
model.learn(total_timesteps=20000, callback=cb)
model.save("ppo2_rpi_led_pargs")

Example #5
def run(alg,
        alg_kwargs,
        task,
        task_kwargs,
        wrappers_kwargs,
        expl_params,
        rollout,
        num_trials,
        folder,
        n_thrds,
        n_lstm,
        rerun=False,
        test_kwargs={},
        num_retrains=10,
        seed=0,
        train_mode=None,
        sl_kwargs=None):
    train_mode = train_mode or 'RL'
    env = test_env(task, kwargs=task_kwargs, num_steps=1000)
    num_timesteps = int(1000 * num_trials / (env.num_tr))
    files = glob.glob(folder + '/*model*')
    vars_ = {
        'alg': alg,
        'alg_kwargs': alg_kwargs,
        'task': task,
        'task_kwargs': task_kwargs,
        'wrappers_kwargs': wrappers_kwargs,
        'expl_params': expl_params,
        'rollout': rollout,
        'folder': folder,
        'num_trials': num_trials,
        'n_thrds': n_thrds,
        'n_lstm': n_lstm
    }
    np.savez(folder + '/params.npz', **vars_)
    if len(files) == 0 or rerun:
        if train_mode == 'RL':
            if alg == "A2C":
                from stable_baselines import A2C as algo
            elif alg == "ACER":
                from stable_baselines import ACER as algo
            elif alg == "ACKTR":
                from stable_baselines import ACKTR as algo
            elif alg == "PPO2":
                from stable_baselines import PPO2 as algo
            env = SubprocVecEnv([
                make_env(env_id=task,
                         rank=i,
                         seed=seed,
                         wrapps=wrappers_kwargs,
                         **task_kwargs) for i in range(n_thrds)
            ])
            model = algo(LstmPolicy,
                         env,
                         verbose=0,
                         n_steps=rollout,
                         n_cpu_tf_sess=n_thrds,
                         tensorboard_log=None,
                         policy_kwargs={
                             "feature_extraction": "mlp",
                             "n_lstm": n_lstm
                         },
                         **alg_kwargs)
            # this assumes 1 trial ~ 10 steps
            sv_freq = 5 * wrappers_kwargs['MonitorExtended-v0']['sv_per']
            chckpnt_cllbck = CheckpointCallback(save_freq=sv_freq,
                                                save_path=folder,
                                                name_prefix='model')
            model.learn(total_timesteps=num_timesteps, callback=chckpnt_cllbck)
            model.save(f"{folder}/model_{num_timesteps}_steps.zip")
            plotting.plot_rew_across_training(folder=folder)
        elif train_mode == 'SL':
            stps_ep = sl_kwargs['steps_per_epoch']
            wraps_sl = deepc(wrappers_kwargs)
            del wraps_sl['PassAction-v0']
            del wraps_sl['PassReward-v0']
            del wraps_sl['MonitorExtended-v0']
            env = make_env(env_id=task,
                           rank=0,
                           seed=seed,
                           wrapps=wraps_sl,
                           **task_kwargs)()
            dataset = ngym.Dataset(env,
                                   batch_size=sl_kwargs['btch_s'],
                                   seq_len=rollout,
                                   batch_first=True)
            obs_size = env.observation_space.shape[0]
            act_size = env.action_space.n
            model = define_model(seq_len=rollout,
                                 num_h=n_lstm,
                                 obs_size=obs_size,
                                 act_size=act_size,
                                 batch_size=sl_kwargs['btch_s'],
                                 stateful=sl_kwargs['stateful'],
                                 loss=sl_kwargs['loss'])
            # Train network
            data_generator = (dataset() for i in range(stps_ep))
            model.fit(data_generator, verbose=1, steps_per_epoch=stps_ep)
            model.save(f"{folder}/model_{stps_ep}_steps")

    if len(test_kwargs) != 0:
        for key in test_kwargs.keys():
            sv_folder = folder + key
            test_kwargs[key]['seed'] = seed
            if train_mode == 'RL':
                ga.get_activity(folder, alg, sv_folder, **test_kwargs[key])
            elif train_mode == 'SL':
                stps_ep = sl_kwargs['steps_per_epoch']
                wraps_sl = deepc(wrappers_kwargs)
                wraps_sl.update(test_kwargs[key]['wrappers'])
                del wraps_sl['PassAction-v0']
                del wraps_sl['PassReward-v0']
                env = make_env(env_id=task,
                               rank=0,
                               seed=seed,
                               wrapps=wraps_sl,
                               **task_kwargs)()
                obs_size = env.observation_space.shape[0]
                act_size = env.action_space.n
                model_test = define_model(seq_len=1,
                                          batch_size=1,
                                          obs_size=obs_size,
                                          act_size=act_size,
                                          stateful=sl_kwargs['stateful'],
                                          num_h=n_lstm,
                                          loss=sl_kwargs['loss'])
                ld_f = (folder + 'model_' + str(stps_ep) +
                        '_steps').replace('//', '/')
                model_test.load_weights(ld_f)
                env.reset()
                for ind_stp in range(sl_kwargs['test_steps']):
                    obs = env.ob_now
                    # add batch and time dimensions expected by the model
                    obs = obs[np.newaxis]
                    obs = obs[np.newaxis]
                    action = model_test.predict(obs)
                    action = np.argmax(action, axis=-1)[0]
                    _, _, _, _ = env.step(action)
Example #6
def main():
    """ Prepare for trainings """
    log_dir, model_dir = prepare_dirs()

    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')

    log_dir = log_dir + '/' + MODEL_NAME
    """ Generate & Check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)
    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)
    """ Vectorize environment """
    num_envs = NUM_ENVS
    # Use a fresh env per worker; `lambda: env` would make every worker share one instance
    env = DummyVecEnv([lambda: gym.make(env_name) for _ in range(num_envs)])  # For training

    eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation
    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)
    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)
    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Train model """
    model.learn(total_timesteps=MAX_STEPS,
                callback=[checkpoint_callback, eval_callback])
    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Save trained model """
    model.save(model_name)
    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()
Example #7
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, VecFrameStack
from stable_baselines.common.callbacks import CheckpointCallback
import numpy as np
import gym
from utils import *  # expected to provide make_env

gamename = "MortalKombat3-Genesis"

if __name__ == "__main__":
    n_cpu = 16

    env = SubprocVecEnv([make_env] * n_cpu)
    env = VecFrameStack(env, n_stack=4)

    model = PPO2(CnnLstmPolicy,
                 env,
                 n_steps=128,
                 verbose=1,
                 tensorboard_log="./tboard_log")
    # Use this if you want to continue training a saved model
    # model = PPO2.load("training_checkpoints/your_model.zip", tensorboard_log="./tboard_log")
    # model.set_env(env)

    checkpoint_callback = CheckpointCallback(
        save_freq=1000,
        save_path='./training_checkpoints',
        name_prefix='subzero-ppo2')
    model.learn(total_timesteps=20000000, callback=checkpoint_callback)
    model.save('subzero-ppo2')
    env.close()
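CheckpointCallback names each file '<name_prefix>_<num_timesteps>_steps.zip' inside save_path, so resuming from the most recent checkpoint can be automated. A small helper sketch (hypothetical, not part of the original script):

import glob
import os
import re

def latest_checkpoint(path='./training_checkpoints', prefix='subzero-ppo2'):
    # Return the checkpoint file with the highest step count, or None if there is none.
    files = glob.glob(os.path.join(path, '{}_*_steps.zip'.format(prefix)))
    if not files:
        return None
    return max(files, key=lambda f: int(re.search(r'_(\d+)_steps', f).group(1)))

# ckpt = latest_checkpoint()
# if ckpt is not None:
#     model = PPO2.load(ckpt, tensorboard_log="./tboard_log")
#     model.set_env(env)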
Example #8
def main(logdir):
    # params
    SLEEP_RATE = 100  # rate in Hz: 1, 2, 10, 50, or 100
    EPISODE_TIME = 30  # episode duration in seconds: 30 or 120
    USE_MPC = False
    N_EPISODE = 1000000
    Action_Choice = np.array([1, 1, 1, 1, 0, 0, 0, 0])
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    logdir = os.path.join(logdir, strftime("%Y-%m-%d--%H:%M:%S", localtime()))
    os.makedirs(logdir)
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE, EPISODE_TIME, USE_MPC, Action_Choice)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # callback
    SAVE_FREQ = EPISODE_LENGTH * 100  # save the model every 100 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # agent
    model = SAC(MlpPolicy,
                env,
                gamma=0.98,
                learning_rate=0.0003,
                buffer_size=1000000,
                learning_starts=EPISODE_LENGTH * 20,
                train_freq=1,
                batch_size=256,
                tau=0.01,
                ent_coef='auto',
                target_update_interval=1,
                gradient_steps=1,
                target_entropy='auto',
                action_noise=None,
                verbose=1,
                tensorboard_log=logdir,
                full_tensorboard_log=True,
                _init_setup_model=True)

    print("---------- Start Learing -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)

    print("---------- Finish Learning ----------")
    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
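SaveOnBestTrainingRewardCallback is not defined in this snippet; it is presumably a custom callback along the lines of the example in the stable-baselines callback documentation, sketched below as an assumption. It relies on the Monitor wrapper (applied above) writing episode statistics to log_dir:

import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    # Sketch: save the model whenever the rolling mean episode reward improves.
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Read the episode rewards logged by the Monitor wrapper
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True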
Example #9
pathlib.Path("./models").mkdir(exist_ok=True)
pathlib.Path("./models/checkpoints").mkdir(exist_ok=True)
env = WarehouseEnv('7x7_4bins_2items_2binslots_1agentslots')

model = DQN(CustomDQNPolicy,
            env,
            verbose=1,
            exploration_fraction=0.95,
            exploration_initial_eps=1,
            exploration_final_eps=0.05,
            batch_size=32,
            buffer_size=50000)

checkpoint_callback = CheckpointCallback(save_freq=50000,
                                         save_path='./models/checkpoints/',
                                         name_prefix=prefix)

#episode_plot_freq = n: update plots every n time steps
#update_stats_every = m: update the stats used in the plots every m episodes
#Note: update_stats_every > 1 loses some information in the plots (not in the training process), but improves performance during training.
plt_callback = plotcallback(episode_plot_freq=10000,
                            update_stats_every=1,
                            average_size=100,
                            verbose=1,
                            plot_prefix=prefix,
                            plot_dir="./Plots")

callbacks = CallbackList([checkpoint_callback, plt_callback])

model.learn(total_timesteps=total_timesteps, callback=callbacks)