def train_sac(training_tag):
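    # stable-baselines expects a vectorized environment, hence the DummyVecEnv wrapper below.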
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])

    if isinstance(training_tag, float):
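        # A float training_tag is used directly as the fixed entropy coefficient (ent_coef).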
        model = SAC(sac_MlpPolicy,
                    env,
                    ent_coef=training_tag,
                    verbose=1,
                    policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            env.reset()

            (model, learning_results) = model.learn(
                total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_s" +
                           str(step) + "_t" + str(file_tag) + "_i" +
                           str(CURRENT_ITERATION) + "_ts" +
                           str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_t" +
                       str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" +
                       str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
        return learning_results

    return None
Example #2
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish     :")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Example #3
def run_experiment(verbose, tensorboard_log, learning_rate):
    # pdb.set_trace()  # debugging breakpoint; enable only for interactive debugging
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
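    # Gaussian exploration noise on each action dimension; note that `stddev` above
    # is unused and the sigma is hard-coded to 0.1.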

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Example #4
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7

    save_video_length = 200
    save_video_interval = 1000000
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name, np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
Example #5
def explore(app,
            emulator,
            appium,
            timesteps,
            timer,
            save_policy,
            policy_dir,
            cycle,
            train_freq=5,
            target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy,
                    env,
                    verbose=1,
                    train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
Example #6
def main(argv):
    fixed = True

    policy_name = "sac_reaching_policy"

    obj_pose_rnd_std = 0 if fixed else 0.05
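    # fixed=True keeps the object pose deterministic; otherwise a 0.05 std perturbation is applied.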
    pandaenv = pandaReachGymEnv(renders=True,
                                use_IK=0,
                                numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]

    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy,
                pandaenv,
                gamma=0.9,
                batch_size=16,
                verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")

    model.learn(total_timesteps=1000000)

    model.save("../pybullet_logs/pandareach_sac/" + policy_name)

    del model  # remove to demonstrate saving and loading
Example #7
def train_SAC(env_train, model_name, timesteps=50000):
    start = time.time()
    model = SAC('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)
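    # Monitor records per-episode reward and length to the run directory.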
    ############################
    #          Logging         #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt',
              'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_,
                                           n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq,
                                           log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #            run           #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps),
                log_interval=20,
                callback=callback)
    model.save(os.path.join(path, "SAC_Walker2d"))
Example #9
def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = SAC(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32, \
                target_update_interval=32, tensorboard_log=dir_path+'/Logs/')
    model.learn(total_timesteps=2000, log_interval=1000000)
    model.save("Grasp_Model_Full_Pose")
Example #10
def train(learning_rate, time_steps, env, model_path):
    # Reset the default TF graph to avoid conflicts with existing parameters
    # (not recommended if you want to reuse parameters).
    tf.reset_default_graph()

    # default policy is MlpPolicy
    model = SAC(CustomSACPolicy, env, verbose=1, seed=10, n_cpu_tf_sess=16)
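    # `CustomSACPolicy` and `callback` are assumed to be defined elsewhere in the original script.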
    model.learn(total_timesteps=int(time_steps), log_interval=1000, callback=callback)
    model.save(model_path)
Example #11
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8

    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  #, 5e-4, 1e-5
        logger = osp.join(
            expDir, name, 'logs%s_%s' % (np.format_float_scientific(nIter),
                                         np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
Example #12
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    #from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()
    # generate expert trajectories
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100, n_episodes=10)
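    # generate_expert_traj rolls out the (untrained) SAC policy and stores the trajectories
    # in expert_model_gail.npz, which feeds the GAIL discriminator below.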

    # Load dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)

    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
Example #13
def train_SAC(env_train, model_name, timesteps=100000):

    # train SAC model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train SAC Model with MlpPolicy: ")

    model = SAC('MlpPolicy', env_train, verbose=0)
    print("SAC Learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("SAC Model learning completed: ")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("SAC Model save finish     :")
    print('Training time SAC: ', (end - start) / 60, ' minutes')
    os.chdir("./..")

    return model
    def train_SAC(self, model_name, model_params=config.SAC_PARAMS):
        """TD3 model"""
        from stable_baselines import SAC

        env_train = self.env

        start = time.time()
        model = SAC(
            'MlpPolicy',
            env_train,
            batch_size=model_params['batch_size'],
            buffer_size=model_params['buffer_size'],
            learning_rate=model_params['learning_rate'],
            learning_starts=model_params['learning_starts'],
            ent_coef=model_params['ent_coef'],
            verbose=model_params['verbose'],
            tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
        model.learn(total_timesteps=model_params['timesteps'],
                    tb_log_name="SAC_run")
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (SAC): ', (end - start) / 60, ' minutes')
        return model
Example #15
                     specific_env_len=70,
                     s_len=150,
                     walls=True,
                     target_vel=params["target_vel"],
                     use_contacts=params["use_contacts"])

        model = SAC('MlpPolicy',
                    env,
                    learning_rate=3e-3,
                    verbose=1,
                    batch_size=64,
                    tensorboard_log="/tmp",
                    gamma=0.99)
        model.learn(total_timesteps=int(params["steps"]))
        print("Done learning, saving model")
        model.save("agents/SBL_{}".format(params["ID"]))
        print("Saved model, closing env")
        env.close()
        print("Finished training with ID: {}".format(ID))
    else:
        env = env_id(params["env_list"],
                     max_n_envs=1,
                     specific_env_len=70,
                     s_len=150,
                     walls=True,
                     target_vel=params["target_vel"],
                     use_contacts=params["use_contacts"])

        print("Testing")
        policy_name = "H02"  # LX3, 63W (tiles): joints + contacts + yaw
        policy_path = 'agents/SBL_{}'.format(policy_name)
Example #16
import gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from env.cache_env import cache_env

import pandas as pd

df = pd.read_csv('./data/requests.csv')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: cache_env(df)])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=40000)

#
model.save("SAC_new")
#%%
#
# load model
#model = SAC.load("SAC")
obs = env.reset()
for i in range(500):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0: env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__  == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[400, 300])
        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                    seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
Example #18
def expert(obs):
    try:
        state = State(env_depth, env_width).load_obs(obs)
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)


# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# pretrain model
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
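# pretrain() behaviour-clones the policy on the expert dataset before it is saved.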
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()

reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1
    if done:
        print(reward_sum, i, reward_sum / i)
        reward_sum = 0
Example #19
print('Environment created...')
# Check how many episodes the model has been trained for:
if os.path.exists(epFile):
    currentEp = read_file(epFile)
    print('Resuming training from episode {}'.format(currentEp))

#  Check if training log of previous training exists, load model if does
if currentEp <= maxEpisodes:
    if os.path.exists('logs/' + simName + '.txt'):
        print('Loading previous model...')
        model = SAC.load(simLogPath + simName,
                         env,
                         tensorboard_log=tensorboardPath)
    else:
        print('Creating new model...')
        model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=tensorboardPath)
    model.learn(total_timesteps=timeStepsPerLoad, log_interval=1)
    print('Training finished...')
    model.save(simLogPath + 'models/' + simName)
    print('Model saved...')
    env.SaveAndQuit()

model = SAC.load(simName, env, tensorboard_log=tensorboardPath)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    if dones:
        env.reset()
Example #20
def test_agent(agent_step):
    now = time.time()
    for coef_index in range(len(CLAC_COEFS)):

        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]
        training_timestep = 0
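        # Three agents are compared per coefficient setting: CLAC, standard SAC, and a
        # CLAC variant with a scheduled mutual-information coefficient ("MIRL").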

        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])

        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])

        mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-3, verbose=1)
        
        for resample_step in range(0, NUM_RESAMPLES):
            features = pd.DataFrame()

            if agent_step == 1:
                print(mut_coef,  "  ",  ent_coef, "  ", NUM_TRAINING_STEPS, "  ",  ENVIRONMENT_NAME, "  ", FOLDER, " ", resample_step)

            (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)

            # Save models 
            clac_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))

            training_timestep += NUM_TRAINING_STEPS

            # Test Normal 
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            # Test generalization 
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            # Test generalization Extreme
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            clac_env.env_method("reset_features")
            sac_env.env_method("reset_features")
            mirl_env.env_method("reset_features")
        
        del sac_model
        del sac_env

        del clac_model
        del clac_env
        
        del mirl_model
        del mirl_env

    later = time.time()
    difference = int(later - now)
    print("Tested Agent Time: ", difference)
Example #21
import gym

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC

import os

#GPU isolation
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

env = gym.make('BipedalWalker-v2')
#env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1, learning_starts=1000)
#model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)
model.save("sac_bipedalwalker")
print("Model saved to sac_bipedalwalkwer")

del model  # remove to demonstrate saving and loading

for i in range(100):
    print("experiment id: ", i)
    model = SAC.load("sac_bipedalwalker",
                     env=env,
                     tensorboard_log="./sac_bipedalwalker_tensorboard/")
    print("loaded")
    model.learn(total_timesteps=500000, log_interval=50)
    print("learned again")
obs = env.reset()

#while True:
Example #22
import gym
import gym_turtlebot3
import rospy

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env_name = 'TurtleBot3_Circuit_Simple_Continuous-v0'

rospy.init_node(env_name.replace('-', '_'))

env = gym.make(env_name)
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=int(1e4), log_interval=10)
model.save(env_name)
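
# A minimal evaluation sketch, assuming the model saved above and the same
# DummyVecEnv are still available; `loaded_model` is just an illustrative name.
loaded_model = SAC.load(env_name, env=env)
obs = env.reset()
for _ in range(1000):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)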
Example #23
            log.logger.warning(
                f"No model was found for version {model_name}. Training a new model with name {model_name}."
            )
            mode = 'train'

    if mode == 'train':
        env.reset()
        env.agg.case = 'rl_agg'
        model = SAC(LnMlpPolicy,
                    env,
                    learning_rate=0.03,
                    verbose=1,
                    tensorboard_log="tensorboard_logs")
        # note that the env won't record MPCCalc output for the training period
        model.learn(total_timesteps=5000, tb_log_name=model_name)
        model.save(model_name)

    obs = env.reset()
    env.agg.case = 'rl_agg'
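    # Roll the trained (or freshly trained) policy forward, writing aggregator outputs
    # every checkpoint_interval steps and at the final step.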
    for t in range(1, num_steps + 1):
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if (t % checkpoint_interval == 0) or (t == num_steps):
            env.agg.write_outputs()

if 'dn' in run:
    env.agg.config['agg']['tou_enabled'] = False
    env.agg.config['agg']['base_price'] = 0.1
    env.agg._build_tou_price()
    env.agg.redis_add_all_data()
    for h in env.agg.all_homes_obj:
Example #24
    def objective(trial):

        kwargs = hyperparams.copy()

        trial.model_class = None

        kwargs.update(sample_sac_params(trial))
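        # Merge the Optuna-sampled SAC hyperparameters into the base config for this trial.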

        def callback(_locals, _globals):
            """
            Callback for monitoring learning progress.
            :param _locals: (dict)
            :param _globals: (dict)
            :return: (bool) If False: stop training
            """
            self_ = _locals['self']
            trial = self_.trial

            # Initialize variables
            if not hasattr(self_, 'is_pruned'):
                self_.is_pruned = False
                self_.last_mean_test_reward = -np.inf
                self_.last_time_evaluated = 0
                self_.eval_idx = 0

            if (self_.num_timesteps -
                    self_.last_time_evaluated) < evaluate_interval:
                return True

            self_.last_time_evaluated = self_.num_timesteps

            # Evaluate the trained agent on the test env
            rewards = []
            n_steps_done, reward_sum = 0, 0.0

            # Sync the obs rms if using vecnormalize
            # NOTE: this does not cover all the possible cases
            if isinstance(self_.test_env, VecNormalize):
                self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
                self_.test_env.ret_rms = deepcopy(self_.env.ret_rms)
                # Do not normalize reward
                self_.test_env.norm_reward = False

            obs = self_.test_env.reset()
            while n_steps_done < n_test_steps:
                # Use default value for deterministic
                action, _ = self_.predict(obs)
                obs, reward, done, _ = self_.test_env.step(action)
                reward_sum += reward
                n_steps_done += 1

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    obs = self_.test_env.reset()
            rewards.append(reward_sum)
            mean_reward = np.mean(rewards)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag='evaluation', simple_value=mean_reward)
            ])
            _locals['writer'].add_summary(summary, self_.num_timesteps)
            self_.last_mean_test_reward = mean_reward
            self_.eval_idx += 1

            # report best or report current ?
            # report num_timesteps or elapsed time ?
            trial.report(-1 * mean_reward, self_.eval_idx)
            # Prune trial if need
            if trial.should_prune(self_.eval_idx):
                self_.is_pruned = True
                return False

            return True

        commands = [[1, 0], [2, 0], [3, 0]]
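        # The same commands and reward weights configure both the training env here and
        # the separate test env assigned to model.test_env below.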
        env = DummyVecEnv([
            lambda: e.AidaBulletEnv(commands,
                                    render=False,
                                    on_rack=False,
                                    default_reward=2,
                                    height_weight=5,
                                    orientation_weight=3,
                                    direction_weight=2,
                                    speed_weight=4)
        ])

        model = SAC(MlpPolicy,
                    env,
                    gamma=kwargs['gamma'],
                    learning_rate=kwargs['learning_rate'],
                    batch_size=kwargs['batch_size'],
                    buffer_size=kwargs['buffer_size'],
                    learning_starts=kwargs['learning_starts'],
                    train_freq=kwargs['train_freq'],
                    gradient_steps=kwargs['gradient_steps'],
                    ent_coef=kwargs['ent_coef'],
                    target_entropy=kwargs['target_entropy'],
                    policy_kwargs=kwargs['policy_kwargs'],
                    tensorboard_log="./optimisationSAC/logOPTI")

        model.test_env = DummyVecEnv([
            lambda: e.AidaBulletEnv(commands,
                                    render=False,
                                    on_rack=False,
                                    default_reward=2,
                                    height_weight=5,
                                    orientation_weight=3,
                                    direction_weight=2,
                                    speed_weight=4)
        ])

        model.trial = trial

        try:
            model.learn(n_timesteps,
                        callback=callback,
                        tb_log_name="SAC_" + str(trial.number))
            # Free memory
            model.env.close()
            model.test_env.close()
        except AssertionError:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            model.test_env.close()
            raise
        is_pruned = False
        cost = np.inf
        if hasattr(model, 'is_pruned'):
            is_pruned = model.is_pruned
            cost = -1 * model.last_mean_test_reward
        try:
            os.mkdir("./optimisationSAC/resultats/" + str(trial.number))
        except FileExistsError:
            print("Directory already exists")

        model.save("./optimisationSAC/resultats/" + str(trial.number) + "/" +
                   str(trial.number))

        del model.env, model.test_env
        del model

        if is_pruned:
            try:
                # Optuna >= 0.19.0
                raise optuna.exceptions.TrialPruned()
            except AttributeError:
                raise optuna.structs.TrialPruned()

        return cost
import gym
import rlbench.gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC
import os
dir_path = os.path.dirname(os.path.realpath(__file__))

env = gym.make("empty_container-state-v0",render_mode="human",observation_mode='vision')
model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=dir_path+'/Logs/')
model.learn(total_timesteps=1000)
model.save("sac_ec")
import gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from env.cache_env import cache_env

import pandas as pd

df = pd.read_csv('./data/requests.csv')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: cache_env(df)])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=40000)
model.save("SAC")

obs = env.reset()
for i in range(20000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
import gym
import numpy as np
import imageio

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)

model.save("../models/sac_pendulum")

del model  # remove to demonstrate saving and loading

model = SAC.load("../models/sac_pendulum")

#obs = env.reset()
#while True:
#    action, _states = model.predict(obs)
#    obs, rewards, dones, info = env.step(action)
#    env.render()
def train_initial_policy(model_name,
                         algo=ALGO,
                         env_name=ENV_NAME,
                         time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print(
        "Model saved as : ", "data/models/" + algo.__name__ +
        "_initial_policy_" + env_name + "_.pkl")
    constrained = False

    # define the environment here
    env = gym.make(env_name)
    if NOISE_VALUE > 0: env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low,
          env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low,
          env.action_space.high)

    if TIMEWRAPPER:
        # env = TimeFeatureWrapper(env)
        env = TimeLimit(env, 1000)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(
            env,
            training=True,
            norm_obs=True,
            norm_reward=False,
            clip_reward=1e6,
        )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args,
                                                   **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(
            CustomPolicy,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            ent_coef=args['ent_coef'],
            learning_starts=args['learning_starts'],
            learning_rate=args['learning_rate'],
            train_freq=args['train_freq'],
        )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) *
                                         np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args,
                                                    **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(
            CustomPolicy2,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            gamma=args['gamma'],
            gradient_steps=args['gradient_steps'],
            learning_rate=args['learning_rate'],
            learning_starts=args['learning_starts'],
            action_noise=action_noise,
            train_freq=args['train_freq'],
        )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard,
                     env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'])

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(
            mlp_standard,
            env,
            n_steps=int(args['n_steps'] / env.num_envs),
            nminibatches=args['nminibatches'],
            lam=args['lam'],
            gamma=args['gamma'],
            ent_coef=args['ent_coef'],
            noptepochs=args['noptepochs'],
            learning_rate=args['learning_rate'],
            cliprange=args['cliprange'],
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
        )

    elif algo.__name__ == "TRPO_lagrangian":
        print(
            'Initializing TRPO-lagrangian with safety-starter-agents hyperparameters .. '
        )

        model = TRPO_lagrangian(
            MLPWithSafeValue,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            timesteps_per_batch=args['timesteps_per_batch'],
            lam=args['lam'],
            max_kl=args['max_kl'],
            gamma=args['gamma'],
            vf_iters=args['vf_iters'],
            vf_stepsize=args['vf_stepsize'],
            entcoeff=args['entcoeff'],
            cg_damping=args['cg_damping'],
            cg_iters=args['cg_iters'],
            cost_lim=args['cost_lim'],
            penalty_init=args['penalty_init'],
            penalty_lr=args['penalty_lr'])
        constrained = True
    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(
            CustomPolicy,
            env,
            verbose=1,
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            ent_coef=args['ent_coef'],
            learning_starts=args['learning_starts'],
            learning_rate=args['learning_rate'],
            train_freq=args['train_freq'],
        )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)

    else:
        model.learn(
            total_timesteps=time_steps,
            tb_log_name=model_name.split('/')[-1],
            log_interval=10,
        )
        model.save(model_name)
        evaluate_policy_on_env(env,
                               model,
                               render=False,
                               iters=10,
                               constrained=constrained)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
Example #29
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "SAC":
    from stable_baselines.sac.policies import MlpPolicy
    from stable_baselines import SAC
    env = gym.make(env_name)
    model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "DDPG":
    if train:
        for i in range(model_num):
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
            from stable_baselines import DDPG
            env = gym.make(env_name)

            # the noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.5) * np.ones(n_actions))
Example #30
	names, values = val[0], val[1]
	for i in range(len(values)):
		sql = ''' INSERT INTO parameters(simu, type, step, value)
			VALUES(?,?,?,?) '''
		val = (model_name, names[i], 0, float(values[i]))
		cur.execute(sql,val)
		conn.commit()
	cur.close()
	conn.close()
	
	
	for i in range(args.total_steps//args.save_every):
		model.learn(total_timesteps=args.save_every, tb_log_name=model_name, reset_num_timesteps=False, callback=callback)
		if normalize:
			env.save_running_average(workDirectory+"/resultats/"+model_name+"/normalizeData")
		model.save(workDirectory+"/resultats/"+model_name+"/"+model_name)
		os.system("python3 makegif.py --algo "+args.algo+" --dir ./server/assets/"+model_name+"_"+str((i+1)*args.save_every)+"_steps.gif --name "+model_name)
		print("\n saved at "+str((i+1)*args.save_every))
	model.save(workDirectory+"/resultats/"+model_name+"/"+model_name)	
	if normalize:
		env.save_running_average(workDirectory+"/resultats/"+model_name+"/normalizeData")
	env = DummyVecEnv([lambda:  e.AidaBulletEnv(commands,
													  render  = False, 
													  on_rack = False,
													  default_reward     = args.default_reward,
													  height_weight      = args.height_weight,
													  orientation_weight = args.orientation_weight,
													  direction_weight   = args.direction_weight,
													  speed_weight       = args.speed_weight,
													  mimic_weight       = args.mimic_weight,