def explore(app, emulator, appium, timesteps, timer, save_policy, policy_dir, cycle,
            train_freq=5, target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy, env, verbose=1, train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
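# `TimerCallback` is project-specific and not defined in this snippet. A
# minimal sketch of what it might look like, assuming it aborts training once
# a wall-clock budget (in seconds) is exhausted; the `timer` and `app`
# semantics here are assumptions, not the original implementation:
import time
from stable_baselines.common.callbacks import BaseCallback

class TimerCallback(BaseCallback):
    def __init__(self, timer, app, verbose=0):
        super(TimerCallback, self).__init__(verbose)
        self.timer = timer  # assumed time budget in seconds
        self.app = app      # app under test (kept for parity with the caller)
        self.start_time = None

    def _on_training_start(self):
        self.start_time = time.time()

    def _on_step(self):
        # Returning False makes model.learn() stop early.
        return (time.time() - self.start_time) < self.timer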
def train_policy(num_of_envs, log_relative_path, maximum_episode_length, skip_frame,
                 seed_num, sac_config, total_time_steps, validate_every_timesteps,
                 task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = SAC(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs,
                **sac_config, seed=seed_num)
    model.learn(total_timesteps=total_time_steps, tb_log_name="sac",
                callback=checkpoint_callback)
    return
def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training from a previously saved model
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1, **kwargs)
    else:
        # `policy`, `n_timesteps`, and `callback` are module-level globals
        # in the original script.
        model = SAC(policy, env,
                    # action_noise=action_noise,
                    verbose=1,
                    tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False,
                    **kwargs)
    model.learn(total_timesteps=n_timesteps, seed=seed, callback=callback,
                log_interval=10)
    return model
def run_experiment(verbose, tensorboard_log, learning_rate):
    pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env, osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = SAC(
        MlpPolicy, env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(learning_rate)))
    env.close()
def main(argv):
    fixed = True
    policy_name = "sac_reaching_policy"
    obj_pose_rnd_std = 0 if fixed else 0.05

    pandaenv = pandaReachGymEnv(renders=True, use_IK=0, numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]
    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy, pandaenv, gamma=0.9, batch_size=16, verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")
    model.learn(total_timesteps=1000000)
    model.save("../pybullet_logs/pandareach_sac/" + policy_name)
    del model  # remove to demonstrate saving and loading
def model_training_learning(env_train, model_name, timesteps=100000):
    # train the selected model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed.")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("- ", model_name, " saved.")
    print("Training time ", model_name, " : ", (end - start) / 60, " minutes")
    os.chdir("./..")
    os.chdir("./..")
    return model
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7
    save_video_length = 200
    save_video_interval = 1000000

    env = VecVideoRecorder(
        env, osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy, env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = SAC(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32,
                target_update_interval=32, tensorboard_log=dir_path + '/Logs/')
    model.learn(total_timesteps=2000, log_interval=1000000)
    model.save("Grasp_Model_Full_Pose")
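# `StateMachine`, `Camera`, and `CustomEnv` are project-specific. A minimal
# sketch of the gym.Env interface CustomEnv would need to expose for SAC's
# CnnPolicy (image observations); the shapes and reward logic below are
# assumptions for illustration only:
import gym
import numpy as np
from gym import spaces

class CustomEnvSketch(gym.Env):
    def __init__(self):
        # CnnPolicy expects image observations (H x W x C, uint8)
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(64, 64, 3), dtype=np.uint8)
        self.action_space = spaces.Box(low=-1.0, high=1.0,
                                       shape=(4,), dtype=np.float32)

    def reset(self):
        return np.zeros(self.observation_space.shape, dtype=np.uint8)

    def step(self, action):
        obs = np.zeros(self.observation_space.shape, dtype=np.uint8)
        reward, done, info = 0.0, False, {}
        return obs, reward, done, info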
def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq, ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time

    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)

    ############################
    #         Logging          #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name + '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_, n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq, log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #           run            #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps), log_interval=20,
                callback=callback)
    model.save(path + "SAC_Walker2d")
def train(learning_rate, time_steps, env, model_path):
    # Reset the graph to avoid conflicts with existing parameters
    # (not recommended if you want to reuse parameters).
    tf.reset_default_graph()
    # The default policy is MlpPolicy.
    model = SAC(CustomSACPolicy, env, verbose=1, seed=10, n_cpu_tf_sess=16)
    model.learn(total_timesteps=int(time_steps), log_interval=1000,
                callback=callback)
    model.save(model_path)
def train_SAC(env_train, model_name, timesteps=50000):
    start = time.time()
    model = SAC('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
def sac(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None):
    env = gym.make(env_id)
    model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)
    save_model_weights(model, "sac", env_id, policy, seed)
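# `save_model_weights` (used above) is not defined in these snippets. A
# minimal sketch of what such a helper could look like; the directory layout
# and file-name scheme are assumptions, not the original implementation:
import os

def save_model_weights(model, algo, env_id, policy, seed, root="weights"):
    """Save a trained stable-baselines model under a descriptive path."""
    os.makedirs(root, exist_ok=True)
    name = "{}_{}_{}_seed{}".format(algo, env_id, policy, seed)
    # stable-baselines appends a .zip extension when none is given
    model.save(os.path.join(root, name))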
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8
    save_video_length = 200
    save_video_interval = 1000000

    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    pool = multiprocessing.Pool(processes=4)

    for lr in [1e-5]:  # , 5e-4, 1e-5
        logger = osp.join(
            expDir, name,
            'logs%s_%s' % (np.format_float_scientific(nIter),
                           np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env, osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy, env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()

    file.close()
    pool.close()
    pool.join()
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('--host', default="127.0.0.1", type=str,
                        help='IP of the server')
    parser.add_argument('--port', default=9081, type=int,
                        help='Port that should be used to connect to the server')
    parser.add_argument('--save', action="store_true",
                        help='Saves checkpoints')
    parser.add_argument('--use_coord', action="store_true",
                        help=('If set, the environment\'s observation space will be '
                              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    env = gym.make('insertion-v0',
                   kwargs={
                       'host': args.host,
                       'port': args.port,
                       'use_coord': args.use_coord
                   })
    # check_env(env, warn=True)
    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")
    # print(env.action_space.sample())

    # Save a checkpoint every 50000 steps
    ckpt = CkptCallback(save_freq=50000, save_path='../checkpoints/',
                        name_prefix='rl_insertion') if args.save else None

    if args.use_coord:
        model = SAC('MlpPolicy', env, verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy', env, verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    model.learn(50001, callback=ckpt)
def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])

    if isinstance(training_tag, float):
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag, verbose=1,
                    policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            env.reset()
            # This fork's learn() returns (model, results); stock
            # stable-baselines returns only the model.
            (model, learning_results) = model.learn(
                total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            file_tag = str(training_tag).replace(".", "p")

            if SAVE_AGENTS:
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                           "_s" + str(step) + "_t" + str(file_tag) +
                           "_i" + str(CURRENT_ITERATION) +
                           "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                       "_t" + str(file_tag) +
                       "_i" + str(CURRENT_ITERATION) +
                       "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    # `data` is assumed to be defined elsewhere in the original script.
    return data
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    # from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()

    # generate expert trajectories with a SAC expert
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100,
                         n_episodes=10)

    # load the expert dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz',
                            traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None

    # Add some param noise for exploration
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model:
        # Continue training from a previously saved model
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1, **kwargs)
    else:
        model = SAC(
            policy, env,
            # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)
    # `callback` is a module-level global in the original script.
    model.learn(total_timesteps=n_timesteps, seed=seed, callback=callback,
                log_interval=10)
    return model
def sac(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None, load_weights=None):
    env = gym.make(env_id)
    if load_weights is not None:
        model = SAC.load(load_weights, env, verbose=0)
    else:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)
    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=callback)
def train_SAC(env_train, model_name, timesteps=100000):
    # train SAC model
    os.chdir("./model_saved/")
    start = time.time()

    print("Train SAC Model with MlpPolicy: ")
    model = SAC('MlpPolicy', env_train, verbose=0)
    print("SAC learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("SAC Model learning completed.")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("SAC Model saved.")
    print('Training time SAC: ', (end - start) / 60, ' minutes')
    os.chdir("./..")
    return model
def train_SAC(self, model_name, model_params=config.SAC_PARAMS):
    """SAC model"""
    from stable_baselines import SAC

    env_train = self.env
    start = time.time()
    model = SAC('MlpPolicy',
                env_train,
                batch_size=model_params['batch_size'],
                buffer_size=model_params['buffer_size'],
                learning_rate=model_params['learning_rate'],
                learning_starts=model_params['learning_starts'],
                ent_coef=model_params['ent_coef'],
                verbose=model_params['verbose'],
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'],
                tb_log_name="SAC_run")
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
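# `config.SAC_PARAMS` is defined elsewhere. Judging from the keys accessed
# above, it is a dict of this shape; the values below are illustrative
# placeholders, not the original configuration:
SAC_PARAMS = {
    "batch_size": 64,
    "buffer_size": 100000,
    "learning_rate": 3e-4,
    "learning_starts": 100,
    "ent_coef": "auto",
    "verbose": 0,
    "timesteps": 50000,
}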
def test_agent(agent_step):
    now = time.time()
    for coef_index in range(len(CLAC_COEFS)):
        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]
        training_timestep = 0

        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef,
                          verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])
        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])
        mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef,
                          coef_schedule=3.3e-3, verbose=1)

        for resample_step in range(0, NUM_RESAMPLES):
            features = pd.DataFrame()
            if agent_step == 1:
                print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ",
                      ENVIRONMENT_NAME, " ", FOLDER, " ", resample_step)

            # This fork's learn() returns (model, results); stock
            # stable-baselines returns only the model.
            (clac_model, learning_results) = clac_model.learn(
                total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (sac_model, learning_results) = sac_model.learn(
                total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (mirl_model, learning_results) = mirl_model.learn(
                total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)

            # Save models (each agent under its own prefix)
            clac_model.save(FOLDER + "/Training/models/CLAC_" +
                            str(mut_coef).replace(".", "p") + "_" +
                            str(agent_step) + "_" + str(resample_step))
            sac_model.save(FOLDER + "/Training/models/SAC_" +
                           str(ent_coef).replace(".", "p") + "_" +
                           str(agent_step) + "_" + str(resample_step))
            mirl_model.save(FOLDER + "/Training/models/MIRL_" +
                            str(mut_coef).replace(".", "p") + "_" +
                            str(agent_step) + "_" + str(resample_step))

            training_timestep += NUM_TRAINING_STEPS

            # Test on the training environment
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            # Test generalization
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            # Test extreme generalization
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef,
                                      NUM_TESTING_STEPS, training_timestep,
                                      agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_" +
                                   str(resample_step) + ".pkl")

            clac_env.env_method("reset_features")
            sac_env.env_method("reset_features")
            mirl_env.env_method("reset_features")

        del sac_model
        del sac_env
        del clac_model
        del clac_env
        del mirl_model
        del mirl_env

    later = time.time()
    difference = int(later - now)
    print("Tested Agent Time: ", difference)
import gym
import rlbench.gym
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC
import os

dir_path = os.path.dirname(os.path.realpath(__file__))

env = gym.make("empty_container-state-v0", render_mode="human",
               observation_mode='vision')
model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=dir_path + '/Logs/')
model.learn(total_timesteps=1000)
model.save("sac_ec")
import gym
import numpy as np
import imageio
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)
model.save("../models/sac_pendulum")

del model  # remove to demonstrate saving and loading

model = SAC.load("../models/sac_pendulum")

# obs = env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()
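# `imageio` is imported above but never used. A minimal sketch of its likely
# intended use, recording a GIF of the loaded agent (the file name and episode
# length are illustrative assumptions):
images = []
obs = env.reset()
img = env.render(mode='rgb_array')
for _ in range(200):
    images.append(img)
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    img = env.render(mode='rgb_array')
imageio.mimsave('sac_pendulum.gif',
                [np.array(frame) for frame in images], fps=29)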
        step += 1
        if dones:
            obs = env.reset()
            break
        # print(rewards)
        # env.render()

print("Initialization evaluation: {}, steps: {}".format(cum_reward, step))
# Evaluation of the initialization is complete
print("Starting Experiment with seed: {}".format(seed))

# model = PPO2(MlpPolicy, env, verbose=True)
model.learn(total_timesteps=1000000, use_action_repeat=False, poisson=False,
            callback=callback, only_explore_with_act_rep=False)

# f.close()
# json = json.dumps(log_data)
# f = open(log_dir + "log_data.json", "w")
# f.write(json)
# f.close()
np.save(log_dir + "log_data.npy", log_data)

# Don't forget to save the VecNormalize statistics when saving the agent
# log_dir = "logs/hopper_aneal/"
# model.save(log_dir + "sac_hopper")
# env.save(os.path.join(log_dir, "vec_normalize.pkl"))
class DriveAgent:
    """
    Python 3. The rest of the files are in Python 2.
    """

    def __init__(self):
        logger.info(os.getcwd())
        # self._interpreter = Interpreter("./converted_model.tflite")
        # self._interpreter.allocate_tensors()
        # print(self._interpreter.get_input_details())
        # print(self._interpreter.get_output_details())
        # _, self._input_height, self._input_width, _ = \
        #     self._interpreter.get_input_details()[0]['shape']

        self.env = AutoDriftEnv(const_throttle=0.3)
        # self.model = SacModel(policy=CnnPolicy, env=self.env)
        self.model = SAC(policy=CnnPolicy, env=self.env)

        # self._input_height = IMAGE_HEIGHT
        # self._input_width = IMAGE_WIDTH
        # print(self._input_height)
        # print(self._input_width)

        # self._socket = socket.socket()
        # socket_addr = ('127.0.0.1', 8888)
        # UNCOMMENT THIS
        # self._socket.connect(socket_addr)

        self.main()

    def main(self):
        try:
            # Save a checkpoint every 1000 steps
            # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/callbacks.py
            checkpoint_callback = CheckpointCallback(save_freq=1000,
                                                     save_path='./logs',
                                                     name_prefix='rl_model',
                                                     verbose=2)
            self.model.learn(total_timesteps=5000, log_interval=4,
                             callback=checkpoint_callback)

            # input_details = self._interpreter.get_input_details()
            # output_details = self._interpreter.get_output_details()
            # with picamera.PiCamera(resolution=(640, 480), framerate=30) as camera:
            #     # camera.vflip = True
            #     # camera.start_preview()
            #     try:
            #         stream = io.BytesIO()
            #         for _ in camera.capture_continuous(stream, format='jpeg',
            #                                            use_video_port=True):
            #             stream.seek(0)
            #             image = Image.open(stream).convert('RGB').resize(
            #                 (self._input_width, self._input_height), Image.ANTIALIAS)
            #             start_time = time.time()
            #
            #             img = np.asarray(image)
            #             img = img[np.newaxis, ...]  # what's this for?
            #             input_data = np.array(img, dtype=np.float32)
            #             print(input_data.shape)
            #             print(input_data)
            #             # self._interpreter.set_tensor(input_details[0]['index'], input_data)
            #
            #             # self._interpreter.invoke()
            #             # output_data = self._interpreter.get_tensor(output_details[0]['index'])[0]
            #             # TEMP FIX
            #             output_data = None
            #
            #             time_taken_ms = (time.time() - start_time) * 1000
            #
            #             print(f'output_data:{output_data}, time_taken:{time_taken_ms}ms')
            #             # camera.annotate_text = str(output_data) + ", " + str(time_taken_ms)
            #             stream.seek(0)
            #             stream.truncate()
            #
            #             data = []
            #             data.append(output_data)
            #             data_string = pickle.dumps(data, protocol=1)
            #             self._socket.send(data_string)
            #     except KeyboardInterrupt:
        except KeyboardInterrupt:
            print("DriveAgent: Ctrl-C")
        finally:
            # camera.stop_preview()
            self.env.close()
            print("DriveAgent: environment closed, done")
import gym_reacher
from stable_baselines import SAC
from stable_baselines.common.callbacks import CheckpointCallback

# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='../results/tests/logs/',
                                         name_prefix='rl_model')

model = SAC('MlpPolicy', 'Reacher3Dof-v0')
model.learn(2000, callback=checkpoint_callback)
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME,
                         time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment."""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ",
          "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low,
          env.observation_space.high)
    print('~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env, training=True, norm_obs=True, norm_reward=False,
                           clip_reward=1e6)

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameter suggestions from:
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameter suggestions from:
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED,
                     )
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard, env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard, env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )
    else:
        print('No algorithm matched. Using SAC .. ')
        # note: CustomPolicy is only defined in the SAC branch above
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1)
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)
        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i],
                                          average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
# env = VecNormalize(env, norm_obs=True, norm_reward=False,
#                    clip_obs=10.)
env = Monitor(env.envs[0], log_dir, allow_early_resets=True)
# env.act_rep = 20

# print(env.observation_space)
low = np.full((12,), -float('inf'))
high = np.full((12,), float('inf'))
space = spaces.Box(low, high, dtype=low.dtype)
env.observation_space = space
print(env.observation_space)
# exit()

model = SAC(MlpPolicy, env, verbose=1)
# model = PPO2(MlpPolicy, env, verbose=True)
model.learn(total_timesteps=1000000, poisson=True, callback=callback)
f.close()

# json = json.dumps(log_data)
# f = open(log_dir + "log_data.json", "w")
# f.write(json)
# f.close()
np.save(log_dir + "log_data.npy", log_data)

# Don't forget to save the VecNormalize statistics when saving the agent
# log_dir = "logs/hopper_aneal/"
# model.save(log_dir + "sac_hopper")
# env.save(os.path.join(log_dir, "vec_normalize.pkl"))
                  verbose=1,
                  tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)

elif algorithm == "SAC":
    from stable_baselines.sac.policies import MlpPolicy
    from stable_baselines import SAC

    env = gym.make(env_name)
    model = SAC(MlpPolicy, env, verbose=1,
                tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)

elif algorithm == "DDPG":
    if train:
        for i in range(model_num):
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import (
                NormalActionNoise, OrnsteinUhlenbeckActionNoise,
                AdaptiveParamNoiseSpec)
            from stable_baselines import DDPG

            env = gym.make(env_name)
            # the noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
def callback(_locals, _globals):
    global best_mean_reward, n_steps
    mean_reward = float("-inf")
    if n_steps % 1000 == 0 and n_steps != 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                best_mean_reward, mean_reward))
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True


log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(env, log_dir, allow_early_resets=True)

time_steps = 1e6
model = SAC(MlpPolicy, env, verbose=0)
model.learn(total_timesteps=int(time_steps), callback=callback)

# results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "Hopper")
x, y = ts2xy(load_results(log_dir), "timesteps")
# results_plotter.plot_results([log_dir], time_steps, y, "Hopper")
plt.plot(x, y)
# plt.show()
plt.savefig("hopper_sac_default")