def __init__(self, session, agents=None, **params):
    self.agents = agents
    self.params = params
    self.session = session

    # Create the environments.
    extra_env_kwargs = self.params.get("extra_env_kwargs", {})
    menv = MakeEnvironment(session,
                           mode=self.params["mode"],
                           seed=self.params["seed"],
                           **self.params["env"])
    self.envs = menv.create_envs(num_workers=self.params["num_workers"],
                                 extra_env_kwargs=extra_env_kwargs)  # self.params["env"]["env_type"]

    self.state = {}
    self.state["steps"] = 0
    self.state["n_episode"] = 0
    self.state["timesteps"] = 0
    self.state["was_reset"] = False

    self.local = {}
    self.local["steps"] = 0
    self.local["n_episode"] = 0

    self.monitor_n_episode()
    self.monitor_timesteps()

    # We only reset once. Later environments will be reset automatically.
    self.reset()
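
# --- Usage sketch (illustrative; not from the original code) -----------------
# Assuming this is the constructor of the explorer class, it is driven by one
# of the "explorer" entries built by the gen_params() functions below, e.g.:
#
#     explorer = Explorer(session, agents=agents, **params["explorer"]["train"])
#
# so that "mode", "seed", "env", "num_workers", and "extra_env_kwargs" inside
# **params are exactly the keys this constructor reads.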
def gen_params(cpanel):
    params = {}

    # Environment
    params["env"] = {}
    params["env"]["name"] = cpanel["model_name"]
    params["env"]["from_module"] = cpanel.get("from_module", '')
    params["env"]["from_params"] = cpanel.get("from_params", False)

    if params["env"]["from_params"]:
        # Build the environment from parameters.
        from digideep.environment.dmc2gym.registration import EnvCreator
        from dextron.zoo.hand_env.hand import grasp

        task_kwargs = {
            "generator_type": cpanel["generator_type"],  # Algorithm for generating the trajectory: simulated/real
            "generator_args": {
                "time_scale_offset": cpanel["time_scale_offset"],
                "time_scale_factor": cpanel["time_scale_factor"],
                "time_noise_factor": cpanel["time_noise_factor"],
                "time_staying_more": cpanel["time_staying_more"],  # timesteps
                "extracts_path": cpanel["extracts_path"],
                "database_filename": cpanel["database_filename"],
            },
            "random": None,
            "pub_cameras": PUB_CAMERAS,
            "reward_type": "reward/20",
            "controller_gain": cpanel["controller_gain"],
        }
        environment_kwargs = {
            "time_limit": cpanel["time_limit"],
            "control_timestep": cpanel["control_timestep"],
        }
        params["env"]["register_args"] = {
            "id": cpanel["model_name"],
            "entry_point": "digideep.environment.dmc2gym.wrapper:DmControlWrapper",
            "kwargs": {
                "dmcenv_creator": EnvCreator(grasp,
                                             task_kwargs=task_kwargs,
                                             environment_kwargs=environment_kwargs,
                                             visualize_reward=True),
                "flat_observation": False,
                "observation_key": "agent",
            },
        }

    ##############################################
    ### Normal Wrappers ###
    #######################
    norm_wrappers = []

    # Converting observation to 1 level
    # if not PUB_CAMERAS:
    #     norm_wrappers.append(dict(name="digideep.environment.wrappers.normal.WrapperLevelDictObs",
    #                               args={"path": cpanel["observation_key"]},
    #                               enabled=True))

    # Normalizing actions (to be in [-1, 1])
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.WrapperNormalizeActDict",
                              args={"paths": ["agent"]},
                              enabled=False))

    ##############################################
    ### Vector Wrappers ###
    #######################
    vect_wrappers = []

    # Normalizing rewards. Not a good idea to normalize sparse rewards.
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeRew",
                              args={"clip": 5,  # 10
                                    "gamma": cpanel["gamma"],
                                    "epsilon": 1e-8},
                              enabled=False))

    # Log successful parameter sets for the expert policy.
    vect_wrappers.append(dict(name="dextron.wrappers.success_logger.VecSuccessLogger",
                              request_for_args=["session_state"],
                              args={"threshold": cpanel["reward_threshold"],  # Remove only zero-rewards
                                    "interval": 100,  # Number of episodes between log reports
                                    "num_workers": cpanel["num_workers"],
                                    "info_keys": ["/rand"],
                                    "obs_keys": ["/parameters"]},
                              enabled=True))
    ##############################################

    params["env"]["main_wrappers"] = {
        "Monitor": {
            "allow_early_resets": True,  # Needed to allow early resets in the test environment.
"reset_keywords": (), "info_keywords": () }, "WrapperDummyMultiAgent": { "agent_name": "agent" }, "WrapperDummyDictObs": { "observation_key": "agent" } } params["env"]["norm_wrappers"] = norm_wrappers params["env"]["vect_wrappers"] = vect_wrappers menv = MakeEnvironment(session=None, mode=None, seed=1, **params["env"]) params["env"]["config"] = menv.get_config() ##################################### # Runner: [episode < cycle < epoch] # ##################################### params["runner"] = {} params["runner"]["name"] = cpanel.get("runner_name", "digideep.pipeline.Runner") params["runner"]["max_time"] = cpanel.get("max_exec_time", None) params["runner"]["max_iter"] = cpanel.get("max_exec_iter", None) params["runner"]["n_cycles"] = cpanel[ "epoch_size"] # Meaning that 100 cycles are 1 epoch. params["runner"]["n_epochs"] = cpanel[ "number_epochs"] # Testing and savings are done after each epoch. params["runner"]["randargs"] = { 'seed': cpanel["seed"], 'cuda_deterministic': cpanel["cuda_deterministic"] } params["runner"]["test_act"] = cpanel["test_activate"] # Test Activate params["runner"]["test_int"] = cpanel["test_interval"] # Test Interval params["runner"]["save_int"] = cpanel["save_interval"] # Save Interval params["agents"] = {} ############################################## ### Agent (#1) ### Demonstrator ################## params["agents"]["demonstrator"] = {} params["agents"]["demonstrator"]["name"] = "demonstrator" params["agents"]["demonstrator"][ "type"] = "dextron.agent.demonstrator.NaiveController" params["agents"]["demonstrator"]["methodargs"] = {} agent_name = params["agents"]["demonstrator"]["name"] params["agents"]["demonstrator"]["methodargs"]["act_space"] = params[ "env"]["config"]["action_space"][agent_name] ############################################## # ############################################## # ### Memory ### # ############## params["memory"] = {} ############################################## ### Explorer ### ################ params["explorer"] = {} params["explorer"]["train"] = {} params["explorer"]["train"]["mode"] = "train" params["explorer"]["train"]["env"] = params["env"] params["explorer"]["train"]["do_reset"] = False params["explorer"]["train"]["final_action"] = False params["explorer"]["train"]["warm_start"] = 0 params["explorer"]["train"]["num_workers"] = cpanel["num_workers"] params["explorer"]["train"][ "deterministic"] = False # MUST: Takes random actions params["explorer"]["train"]["n_steps"] = cpanel[ "n_steps"] # Number of steps to take a step in the environment params["explorer"]["train"][ "n_episodes"] = None # Do not limit # of episodes params["explorer"]["train"][ "win_size"] = 20 # Number of episodes to episode reward for report params["explorer"]["train"]["render"] = False params["explorer"]["train"]["render_delay"] = 0 params["explorer"]["train"]["seed"] = cpanel["seed"] + 90 params["explorer"]["train"]["extra_env_kwargs"] = { "mode": params["explorer"]["train"]["mode"], "allow_demos": False } params["explorer"]["test"] = {} params["explorer"]["test"]["mode"] = "test" params["explorer"]["test"]["env"] = params["env"] params["explorer"]["test"]["do_reset"] = True params["explorer"]["test"]["final_action"] = False params["explorer"]["test"]["warm_start"] = 0 params["explorer"]["test"]["num_workers"] = cpanel[ "num_workers"] # We can use the same amount of workers for testing! 
params["explorer"]["test"][ "deterministic"] = True # MUST: Takes the best action params["explorer"]["test"]["n_steps"] = None # Do not limit # of steps params["explorer"]["test"]["n_episodes"] = cpanel["test_win_size"] params["explorer"]["test"]["win_size"] = cpanel[ "test_win_size"] # Extra episodes won't be counted params["explorer"]["test"]["render"] = False params["explorer"]["test"]["render_delay"] = 0 params["explorer"]["test"]["seed"] = cpanel[ "seed"] + 100 # We want to make the seed of test environments different from training. params["explorer"]["test"]["extra_env_kwargs"] = { "mode": params["explorer"]["test"]["mode"], "allow_demos": False } params["explorer"]["eval"] = {} params["explorer"]["eval"]["mode"] = "eval" params["explorer"]["eval"]["env"] = params["env"] params["explorer"]["eval"]["do_reset"] = False params["explorer"]["eval"]["final_action"] = False params["explorer"]["eval"]["warm_start"] = 0 params["explorer"]["eval"]["num_workers"] = 1 params["explorer"]["eval"][ "deterministic"] = True # MUST: Takes the best action params["explorer"]["eval"]["n_steps"] = None # Do not limit # of steps params["explorer"]["eval"]["n_episodes"] = 1 params["explorer"]["eval"]["win_size"] = -1 params["explorer"]["eval"]["render"] = True params["explorer"]["eval"]["render_delay"] = 0 params["explorer"]["eval"]["seed"] = cpanel[ "seed"] + 101 # We want to make the seed of eval environment different from test/train. params["explorer"]["eval"]["extra_env_kwargs"] = { "mode": params["explorer"]["eval"]["mode"], "allow_demos": cpanel.get("allow_demos", False) } ############################################## params["explorer"]["demo"] = {} params["explorer"]["demo"]["mode"] = "demo" params["explorer"]["demo"]["env"] = params["env"] params["explorer"]["demo"]["do_reset"] = False params["explorer"]["demo"]["final_action"] = False params["explorer"]["demo"]["warm_start"] = 0 params["explorer"]["demo"]["num_workers"] = cpanel["num_workers"] params["explorer"]["demo"][ "deterministic"] = False # MUST: Takes random actions params["explorer"]["demo"]["n_steps"] = cpanel[ "n_steps"] # Number of steps to take a step in the environment params["explorer"]["demo"]["n_episodes"] = None params["explorer"]["demo"]["win_size"] = -1 params["explorer"]["demo"]["render"] = cpanel["render"] params["explorer"]["demo"]["render_delay"] = 0 params["explorer"]["demo"]["seed"] = cpanel["seed"] + 50 params["explorer"]["demo"]["extra_env_kwargs"] = { "mode": params["explorer"]["demo"]["mode"], "allow_demos": True } return params
def gen_params(cpanel):
    params = {}

    # Environment
    params["env"] = {}
    params["env"]["name"] = cpanel["model_name"]
    params["env"]["from_module"] = cpanel.get("from_module", '')
    params["env"]["from_params"] = cpanel.get("from_params", False)

    ##############################################
    ### Normal Wrappers ###
    #######################
    norm_wrappers = []

    # Converting observation to 1 level
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normal.WrapperLevelDictObs",
                              args={"path": cpanel["observation_key"]},
                              enabled=False))
    # Normalizing actions (to be in [-1, 1])
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.WrapperNormalizeActDict",
                              args={"paths": ["agent"]},
                              enabled=True))

    ##############################################
    ### Vector Wrappers ###
    #######################
    vect_wrappers = []

    # Normalizing observations
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeObsDict",
                              args={"paths": [cpanel["observation_key"]],
                                    "clip": 10,
                                    "epsilon": 1e-8},
                              enabled=False))
    # Normalizing rewards
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeRew",
                              args={"clip": 10,
                                    "gamma": cpanel["gamma"],
                                    "epsilon": 1e-8},
                              enabled=False))
    ##############################################

    params["env"]["main_wrappers"] = {
        "Monitor": {
            "allow_early_resets": True,  # Needed to allow early resets in the test environment.
            "reset_keywords": (),
            "info_keywords": (),
        },
        "WrapperDummyMultiAgent": {"agent_name": "agent"},
        "WrapperDummyDictObs": {"observation_key": "agent"},
    }
    params["env"]["norm_wrappers"] = norm_wrappers
    params["env"]["vect_wrappers"] = vect_wrappers

    menv = MakeEnvironment(session=None, mode=None, seed=1, **params["env"])
    params["env"]["config"] = menv.get_config()

    # Some parameters
    # params["env"]["gamma"] = 1 - 1/params["env"]["config"]["max_steps"]  # 0.98

    #####################################
    # Runner: [episode < cycle < epoch] #
    #####################################
    params["runner"] = {}
    params["runner"]["name"] = cpanel.get("runner_name", "digideep.pipeline.Runner")
    params["runner"]["max_time"] = cpanel.get("max_exec_time", None)
    params["runner"]["max_iter"] = cpanel.get("max_exec_iter", None)
    params["runner"]["n_cycles"] = cpanel["epoch_size"]      # Number of cycles per epoch (e.g., 100 cycles = 1 epoch).
    params["runner"]["n_epochs"] = cpanel["number_epochs"]   # Testing and saving are done after each epoch.
    params["runner"]["randargs"] = {'seed': cpanel["seed"], 'cuda_deterministic': cpanel["cuda_deterministic"]}
    params["runner"]["test_act"] = cpanel["test_activate"]   # Test activate
    params["runner"]["test_int"] = cpanel["test_interval"]   # Test interval
    params["runner"]["save_int"] = cpanel["save_interval"]   # Save interval
    # We "save" after each epoch is done.
    # We "test" after each epoch is done.

    params["agents"] = {}
    ##############################################
    ### Agent (#1) ###
    ##################
    params["agents"]["agent"] = {}
    params["agents"]["agent"]["name"] = "agent"
    params["agents"]["agent"]["type"] = cpanel["agent_type"]
    params["agents"]["agent"]["observation_path"] = cpanel["observation_key"]
    params["agents"]["agent"]["methodargs"] = {}
    params["agents"]["agent"]["methodargs"]["n_update"] = cpanel["n_update"]  # Number of times to perform the update.
    # (Alternative name: PPO_EPOCH.)
    params["agents"]["agent"]["methodargs"]["gamma"] = cpanel["gamma"]  # Discount factor gamma
    # params["agents"]["agent"]["methodargs"]["clamp_return"] = 1/(1-float(cpanel["gamma"]))
    # print("Clip Return =", params["agents"]["agent"]["methodargs"]["clamp_return"])
    params["agents"]["agent"]["methodargs"]["mean_lambda"] = cpanel["mean_lambda"]
    params["agents"]["agent"]["methodargs"]["std_lambda"] = cpanel["std_lambda"]
    params["agents"]["agent"]["methodargs"]["z_lambda"] = cpanel["z_lambda"]

    ################
    params["agents"]["agent"]["sampler_list"] = ["digideep.agent.ddpg.sampler.sampler_re"]
    params["agents"]["agent"]["sampler_args"] = {"agent_name": params["agents"]["agent"]["name"],
                                                 "batch_size": cpanel["batch_size"],
                                                 "observation_path": params["agents"]["agent"]["observation_path"]}
    # # It deletes the last element from the chunk:
    # params["agents"]["agent"]["sampler"]["truncate_datalists"] = {"n": 1}  # MUST be 1 to truncate the last item: (T+1 --> T)

    #############
    ### Model ###
    #############
    agent_name = params["agents"]["agent"]["name"]
    observation_path = params["agents"]["agent"]["observation_path"]
    params["agents"]["agent"]["policyname"] = "digideep.agent.sac.Policy"
    params["agents"]["agent"]["policyargs"] = {"obs_space": params["env"]["config"]["observation_space"][observation_path],
                                               "act_space": params["env"]["config"]["action_space"][agent_name],
                                               "hidden_size": 256,
                                               "value_args": {"init_w": 0.003},
                                               "softq_args": {"init_w": 0.003},
                                               "actor_args": {"init_w": 0.003, "log_std_min": -20, "log_std_max": 2},
                                               "average_args": {"mode": "soft", "polyak_factor": cpanel["polyak_factor"]},
                                               # "average_args": {"mode": "hard", "interval": 10000},
                                               }

    # lim = params["env"]["config"]["action_space"][agent_name]["lim"][1][0]
    # # params["agents"]["agent"]["noisename"] = "digideep.agent.noises.EGreedyNoise"
    # # params["agents"]["agent"]["noiseargs"] = {"std": cpanel["noise_std"], "e": 0.3, "lim": lim}
    # params["agents"]["agent"]["noisename"] = "digideep.agent.noises.OrnsteinUhlenbeckNoise"
    # params["agents"]["agent"]["noiseargs"] = {"mu": 0, "theta": 0.15, "sigma": cpanel["noise_std"], "lim": lim}
    # # params["agents"]["agent"]["noiseargs"] = {"mu": 0, "theta": 0.15, "sigma": 1}

    params["agents"]["agent"]["optimname_value"] = "torch.optim.Adam"
    params["agents"]["agent"]["optimargs_value"] = {"lr": cpanel["lr_value"]}  # , "eps": cpanel["eps"]
    params["agents"]["agent"]["optimname_softq"] = "torch.optim.Adam"
    params["agents"]["agent"]["optimargs_softq"] = {"lr": cpanel["lr_softq"]}  # , "eps": cpanel["eps"]
    params["agents"]["agent"]["optimname_actor"] = "torch.optim.Adam"
    params["agents"]["agent"]["optimargs_actor"] = {"lr": cpanel["lr_actor"]}  # , "eps": cpanel["eps"]
    # # RMSprop optimizer alpha:
    # # params["agents"]["agent"]["optimargs"] = {"lr": 1e-2, "alpha": 0.99, "eps": 1e-5, "weight_decay": 0, "momentum": 0, "centered": False}
    ##############################################

    ##############################################
    ### Memory ###
    ##############
    params["memory"] = {}
    params["memory"]["train"] = {}
    params["memory"]["train"]["type"] = "digideep.memory.ringbuffer.Memory"
    params["memory"]["train"]["args"] = {"name": "train",
                                         "keep_old_checkpoints": cpanel.get("keep_old_checkpoints", False),
                                         "chunk_sample_len": cpanel["n_steps"],
                                         "buffer_chunk_len": cpanel["memory_size_in_chunks"],
                                         "overrun": 1}
    ##############################################

    ##############################################
    ### Explorer ###
    ################
    params["explorer"] = {}

    params["explorer"]["train"] = {}
    params["explorer"]["train"]["mode"] = "train"
params["explorer"]["train"]["env"] = params["env"] params["explorer"]["train"]["do_reset"] = False params["explorer"]["train"]["final_action"] = False params["explorer"]["train"]["warm_start"] = cpanel["warm_start"] # In less than "warm_start" steps the agent will take random actions. params["explorer"]["train"]["num_workers"] = cpanel["num_workers"] params["explorer"]["train"]["deterministic"] = False # MUST: Takes random actions params["explorer"]["train"]["n_steps"] = cpanel["n_steps"] # Number of steps to take a step in the environment params["explorer"]["train"]["n_episodes"] = None # Do not limit # of episodes params["explorer"]["train"]["win_size"] = 20 # Number of episodes to episode reward for report params["explorer"]["train"]["render"] = False params["explorer"]["train"]["render_delay"] = 0 params["explorer"]["train"]["seed"] = cpanel["seed"] + 90 params["explorer"]["train"]["extra_env_kwargs"] = {} params["explorer"]["test"] = {} params["explorer"]["test"]["mode"] = "test" params["explorer"]["test"]["env"] = params["env"] params["explorer"]["test"]["do_reset"] = True params["explorer"]["test"]["final_action"] = False params["explorer"]["test"]["warm_start"] = 0 params["explorer"]["test"]["num_workers"] = cpanel["num_workers"] # We can use the same amount of workers for testing! params["explorer"]["test"]["deterministic"] = True # MUST: Takes the best action params["explorer"]["test"]["n_steps"] = None # Do not limit # of steps params["explorer"]["test"]["n_episodes"] = cpanel["test_win_size"] params["explorer"]["test"]["win_size"] = cpanel["test_win_size"] # Extra episodes won't be counted params["explorer"]["test"]["render"] = False params["explorer"]["test"]["render_delay"] = 0 params["explorer"]["test"]["seed"] = cpanel["seed"] + 100 # We want to make the seed of test environments different from training. params["explorer"]["test"]["extra_env_kwargs"] = {} params["explorer"]["eval"] = {} params["explorer"]["eval"]["mode"] = "eval" params["explorer"]["eval"]["env"] = params["env"] params["explorer"]["eval"]["do_reset"] = False params["explorer"]["eval"]["final_action"] = False params["explorer"]["eval"]["warm_start"] = 0 params["explorer"]["eval"]["num_workers"] = 1 params["explorer"]["eval"]["deterministic"] = True # MUST: Takes the best action params["explorer"]["eval"]["n_steps"] = None # Do not limit # of steps params["explorer"]["eval"]["n_episodes"] = 1 params["explorer"]["eval"]["win_size"] = -1 params["explorer"]["eval"]["render"] = True params["explorer"]["eval"]["render_delay"] = 0 params["explorer"]["eval"]["seed"] = cpanel["seed"] + 101 # We want to make the seed of eval environment different from test/train. params["explorer"]["eval"]["extra_env_kwargs"] = {} ############################################## return params
def gen_params(cpanel):
    params = {}

    # Environment
    params["env"] = {}
    params["env"]["name"] = cpanel["model_name"]
    # Other possible modules: roboschool | pybullet_envs
    params["env"]["from_module"] = cpanel.get("from_module", '')
    params["env"]["from_params"] = cpanel.get("from_params", False)

    ##############################################
    ### Normal Wrappers ###
    #######################
    norm_wrappers = []

    # Converting observation to 1 level
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normal.WrapperLevelDictObs",
                              args={"path": cpanel["observation_key"]},
                              enabled=False))
    # Normalizing actions (to be in [-1, 1])
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.WrapperNormalizeActDict",
                              args={"paths": ["agent"]},
                              enabled=False))

    ##############################################
    ### Vector Wrappers ###
    #######################
    vect_wrappers = []

    # Normalizing observations
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeObsDict",
                              args={"paths": [cpanel["observation_key"]],
                                    "clip": 10,
                                    "epsilon": 1e-8},
                              enabled=True))
    # Normalizing rewards
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeRew",
                              args={"clip": 10,
                                    "gamma": cpanel["gamma"],
                                    "epsilon": 1e-8},
                              enabled=True))
    ##############################################

    params["env"]["main_wrappers"] = {
        "Monitor": {
            "allow_early_resets": True,  # Needed to allow early resets in the test environment.
            "reset_keywords": (),
            "info_keywords": (),
        },
        "WrapperDummyMultiAgent": {"agent_name": "agent"},
        "WrapperDummyDictObs": {"observation_key": "agent"},
    }
    params["env"]["norm_wrappers"] = norm_wrappers
    params["env"]["vect_wrappers"] = vect_wrappers

    menv = MakeEnvironment(session=None, mode=None, seed=1, **params["env"])
    params["env"]["config"] = menv.get_config()

    # Some parameters
    # params["env"]["gamma"] = 1 - 1/params["env"]["config"]["max_steps"]  # 0.98

    #####################################
    # Runner: [episode < cycle < epoch] #
    #####################################
    params["runner"] = {}
    params["runner"]["name"] = cpanel.get("runner_name", "digideep.pipeline.Runner")
    params["runner"]["max_time"] = cpanel.get("max_exec_time", None)
    params["runner"]["max_iter"] = cpanel.get("max_exec_iter", None)
    params["runner"]["n_cycles"] = cpanel["epoch_size"]      # Number of cycles per epoch (e.g., 100 cycles = 1 epoch).
    params["runner"]["n_epochs"] = cpanel["number_epochs"]   # Testing and saving are done after each epoch.
    params["runner"]["randargs"] = {'seed': cpanel["seed"], 'cuda_deterministic': cpanel["cuda_deterministic"]}
    params["runner"]["test_act"] = cpanel["test_activate"]   # Test activate
    params["runner"]["test_int"] = cpanel["test_interval"]   # Test interval
    params["runner"]["save_int"] = cpanel["save_interval"]   # Save interval
    # We "save" after each epoch is done.
    # We "test" after each epoch is done.

    params["agents"] = {}
    ##############################################
    ### Agent (#1) ###
    ##################
    params["agents"]["agent"] = {}
    params["agents"]["agent"]["name"] = "agent"
    params["agents"]["agent"]["type"] = cpanel["agent_type"]
    params["agents"]["agent"]["observation_path"] = cpanel["observation_key"]
    params["agents"]["agent"]["methodargs"] = {}
    params["agents"]["agent"]["methodargs"]["n_steps"] = cpanel["n_steps"]    # Same as "num_steps" / T
    params["agents"]["agent"]["methodargs"]["n_update"] = cpanel["n_update"]  # Number of times to perform the PPO update.
    # (Alternative name: PPO_EPOCH.)
    params["agents"]["agent"]["methodargs"]["clip_param"] = cpanel["clip_param"]            # PPO clip parameter
    params["agents"]["agent"]["methodargs"]["value_loss_coef"] = cpanel["value_loss_coef"]  # Value-loss coefficient
    params["agents"]["agent"]["methodargs"]["entropy_coef"] = cpanel["entropy_coef"]        # Entropy-term coefficient
    params["agents"]["agent"]["methodargs"]["max_grad_norm"] = cpanel["max_grad_norm"]      # Max norm of gradients
    params["agents"]["agent"]["methodargs"]["use_clipped_value_loss"] = cpanel["use_clipped_value_loss"]

    params["agents"]["agent"]["sampler"] = {}
    params["agents"]["agent"]["sampler"]["agent_name"] = params["agents"]["agent"]["name"]
    params["agents"]["agent"]["sampler"]["num_mini_batches"] = cpanel["num_mini_batches"]
    params["agents"]["agent"]["sampler"]["compute_advantages"] = {"gamma": cpanel["gamma"],  # Discount factor for rewards
                                                                  "tau": cpanel["tau"],      # GAE parameter
                                                                  "use_gae": cpanel["use_gae"]}
    # It deletes the last element from the chunk:
    params["agents"]["agent"]["sampler"]["truncate_datalists"] = {"n": 1}  # MUST be 1 to truncate the last item: (T+1 --> T)
    params["agents"]["agent"]["sampler"]["observation_path"] = params["agents"]["agent"]["observation_path"]

    #############
    ### Model ###
    #############
    agent_name = params["agents"]["agent"]["name"]
    observation_path = params["agents"]["agent"]["observation_path"]
    params["agents"]["agent"]["policyname"] = "digideep.agent.ppo.Policy"
    params["agents"]["agent"]["policyargs"] = {"obs_space": params["env"]["config"]["observation_space"][observation_path],
                                               "act_space": params["env"]["config"]["action_space"][agent_name],
                                               "modelname": "digideep.model.models.MLPModel",
                                               "modelargs": {"recurrent": cpanel["recurrent"],
                                                             "output_size": cpanel["actor_feature_size"]}}

    params["agents"]["agent"]["optimname"] = "torch.optim.Adam"
    params["agents"]["agent"]["optimargs"] = {"lr": cpanel["lr"], "eps": cpanel["eps"]}
    # RMSprop optimizer alpha:
    # params["agents"]["agent"]["optimargs"] = {"lr": 1e-2, "alpha": 0.99, "eps": 1e-5, "weight_decay": 0, "momentum": 0, "centered": False}
    ##############################################

    ##############################################
    ### Memory ###
    ##############
    params["memory"] = {}
    params["memory"]["train"] = {}
    params["memory"]["train"]["type"] = "digideep.memory.rollbuffer.Memory"
    params["memory"]["train"]["args"] = {"name": "train",
                                         "chunk_sample_len": cpanel["n_steps"],
                                         "buffer_chunk_len": cpanel["memory_size_in_chunks"],
                                         "overrun": 1}
    ##############################################

    ##############################################
    ### Explorer ###
    ################
    params["explorer"] = {}

    params["explorer"]["train"] = {}
    params["explorer"]["train"]["mode"] = "train"
    params["explorer"]["train"]["env"] = params["env"]
    params["explorer"]["train"]["do_reset"] = False
    params["explorer"]["train"]["final_action"] = True
    params["explorer"]["train"]["warm_start"] = cpanel["warm_start"]  # For fewer than "warm_start" steps the agent takes random actions.
params["explorer"]["train"]["num_workers"] = cpanel["num_workers"] params["explorer"]["train"][ "deterministic"] = False # MUST: Takes random actions params["explorer"]["train"]["n_steps"] = cpanel[ "n_steps"] # Number of steps to take a step in the environment params["explorer"]["train"][ "n_episodes"] = None # Do not limit # of episodes params["explorer"]["train"][ "win_size"] = 10 # Number of episodes to episode reward for report params["explorer"]["train"]["render"] = False params["explorer"]["train"]["render_delay"] = 0 params["explorer"]["train"]["seed"] = cpanel["seed"] # + 3500 params["explorer"]["train"]["extra_env_kwargs"] = {} params["explorer"]["test"] = {} params["explorer"]["test"]["mode"] = "test" params["explorer"]["test"]["env"] = params["env"] params["explorer"]["test"]["do_reset"] = False params["explorer"]["test"]["final_action"] = False params["explorer"]["test"]["warm_start"] = 0 params["explorer"]["test"]["num_workers"] = cpanel[ "num_workers"] # We can use the same amount of workers for testing! params["explorer"]["test"][ "deterministic"] = True # MUST: Takes the best action params["explorer"]["test"]["n_steps"] = None # Do not limit # of steps params["explorer"]["test"]["n_episodes"] = cpanel["test_win_size"] params["explorer"]["test"]["win_size"] = cpanel[ "test_win_size"] # Extra episodes won't be counted params["explorer"]["test"]["render"] = False params["explorer"]["test"]["render_delay"] = 0 params["explorer"]["test"]["seed"] = cpanel[ "seed"] + 100 # We want to make the seed of test environments different from training. params["explorer"]["test"]["extra_env_kwargs"] = {} params["explorer"]["eval"] = {} params["explorer"]["eval"]["mode"] = "eval" params["explorer"]["eval"]["env"] = params["env"] params["explorer"]["eval"]["do_reset"] = False params["explorer"]["eval"]["final_action"] = False params["explorer"]["eval"]["warm_start"] = 0 params["explorer"]["eval"]["num_workers"] = 1 params["explorer"]["eval"][ "deterministic"] = True # MUST: Takes the best action params["explorer"]["eval"]["n_steps"] = None # Do not limit # of steps params["explorer"]["eval"]["n_episodes"] = 1 params["explorer"]["eval"]["win_size"] = -1 params["explorer"]["eval"]["render"] = True params["explorer"]["eval"]["render_delay"] = 0 params["explorer"]["eval"]["seed"] = cpanel[ "seed"] + 101 # We want to make the seed of eval environment different from test/train. params["explorer"]["eval"]["extra_env_kwargs"] = {} ############################################## return params
def gen_params(cpanel):
    params = {}

    # Environment
    params["env"] = {}
    params["env"]["name"] = cpanel["model_name"]
    params["env"]["from_module"] = cpanel.get("from_module", '')
    params["env"]["from_params"] = cpanel.get("from_params", False)

    if params["env"]["from_params"]:
        # Build the environment from parameters.
        from digideep.environment.dmc2gym.registration import EnvCreator
        from dextron.zoo.hand_env.hand import grasp

        task_kwargs = {"generator": {"time_scale_offset": cpanel["time_scale_offset"],
                                     "time_scale_factor": cpanel["time_scale_factor"],
                                     "time_noise_factor": cpanel["time_noise_factor"]},
                       "random": None,
                       "pub_cameras": cpanel["pub_cameras"]}
        # "teaching_rate": cpanel["teaching_rate"]
        # visualize_reward=True
        environment_kwargs = {"time_limit": cpanel["time_limit"],
                              "control_timestep": 0.02}
        params["env"]["register_args"] = {"id": cpanel["model_name"],
                                          "entry_point": "digideep.environment.dmc2gym.wrapper:DmControlWrapper",
                                          "kwargs": {'dmcenv_creator': EnvCreator(grasp,
                                                                                  task_kwargs=task_kwargs,
                                                                                  environment_kwargs=environment_kwargs,
                                                                                  visualize_reward=False),
                                                     'flat_observation': False,
                                                     'observation_key': "agent"}}

    ##############################################
    ### Normal Wrappers ###
    #######################
    norm_wrappers = []

    # Converting observation to 1 level
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normal.WrapperLevelDictObs",
                              args={"path": cpanel["observation_key"]},
                              enabled=True))
    # norm_wrappers.append(dict(name="digideep.environment.wrappers.normal.WrapperTransposeImage",
    #                           args={"path": "/camera"},
    #                           enabled=True))

    # Normalizing actions (to be in [-1, 1])
    norm_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.WrapperNormalizeActDict",
                              args={"paths": ["agent"]},
                              enabled=False))

    ##############################################
    ### Vector Wrappers ###
    #######################
    vect_wrappers = []

    if cpanel["pub_cameras"]:
        vect_wrappers.append(dict(name="digideep.environment.wrappers.vector.VecFrameStackAxis",
                                  args={"path": "/camera",
                                        "nstack": 4,  # Called "phi length" in the DQN Nature paper
                                        "axis": 0},   # axis=0 is required when ImageTransposeWrapper is applied (as in the Atari games).
                                  enabled=True))
    # Normalizing observations
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeObsDict",
                              args={"paths": [cpanel["observation_key"]],
                                    "clip": 5,  # 10
                                    "epsilon": 1e-8},
                              enabled=True))
    # Normalizing rewards. Not a good idea to normalize sparse rewards.
    vect_wrappers.append(dict(name="digideep.environment.wrappers.normalizers.VecNormalizeRew",
                              args={"clip": 5,  # 10
                                    "gamma": cpanel["gamma"],
                                    "epsilon": 1e-8},
                              enabled=True))
    ##############################################

    params["env"]["main_wrappers"] = {
        "Monitor": {
            "allow_early_resets": True,  # Needed to allow early resets in the test environment.
            "reset_keywords": (),
            "info_keywords": (),
        },
        "WrapperDummyMultiAgent": {"agent_name": "agent"},
        "WrapperDummyDictObs": {"observation_key": "agent"},
    }
    params["env"]["norm_wrappers"] = norm_wrappers
    params["env"]["vect_wrappers"] = vect_wrappers

    menv = MakeEnvironment(session=None, mode=None, seed=1, **params["env"])
    params["env"]["config"] = menv.get_config()

    # Some parameters
    # params["env"]["gamma"] = 1 - 1/params["env"]["config"]["max_steps"]  # 0.98

    #####################################
    # Runner: [episode < cycle < epoch] #
    #####################################
    params["runner"] = {}
    params["runner"]["name"] = cpanel.get("runner_name", "digideep.pipeline.Runner")
    params["runner"]["n_cycles"] = cpanel["epoch_size"]  # Number of cycles per epoch (e.g., 100 cycles = 1 epoch).
params["runner"]["n_epochs"] = cpanel["number_epochs"] # Testing and savings are done after each epoch. params["runner"]["randargs"] = {'seed':cpanel["seed"], 'cuda_deterministic':cpanel["cuda_deterministic"]} params["runner"]["test_act"] = cpanel["test_activate"] # Test Activate params["runner"]["test_int"] = cpanel["test_interval"] # Test Interval params["runner"]["save_int"] = cpanel["save_interval"] # Save Interval # We "save" after each epoch is done. # We "test" after each epoch is done. params["agents"] = {} ############################################## ### Agent (#1) ### Soft Actor-Critic ################## params["agents"]["agent"] = {} params["agents"]["agent"]["name"] = "agent" params["agents"]["agent"]["type"] = "dextron.agent.sac.Agent" # "digideep.agent.sac.Agent" params["agents"]["agent"]["observation_path"] = "/camera" # cpanel["observation_key"] params["agents"]["agent"]["methodargs"] = {} params["agents"]["agent"]["methodargs"]["n_update"] = cpanel["n_update"] # Number of times to perform PPO update. Alternative name: PPO_EPOCH params["agents"]["agent"]["methodargs"]["gamma"] = cpanel["gamma"] # Discount factor Gamma # params["agents"]["agent"]["methodargs"]["clamp_return"] = 1/(1-float(cpanel["gamma"])) # print("Clip Return =", params["agents"]["agent"]["methodargs"]["clamp_return"]) params["agents"]["agent"]["methodargs"]["mean_lambda"] = cpanel["mean_lambda"] params["agents"]["agent"]["methodargs"]["std_lambda"] = cpanel["std_lambda"] params["agents"]["agent"]["methodargs"]["z_lambda"] = cpanel["z_lambda"] ################ demo_batch_size = int(cpanel["demo_use_ratio"] * cpanel["batch_size"]) train_batch_size = cpanel["batch_size"] - demo_batch_size params["agents"]["agent"]["sampler_list"] = ["dextron.agent.sac.multi_sampler.multi_memory_sample"] params["agents"]["agent"]["sampler_args"] = {"agent_name":params["agents"]["agent"]["name"], "batch_size":cpanel["batch_size"], "scheduler_start":cpanel["scheduler_start"], "scheduler_steps":cpanel["scheduler_steps"], "scheduler_decay":cpanel["scheduler_decay"], "batch_size_dict":{"train":train_batch_size, "demo":demo_batch_size}, "observation_path":params["agents"]["agent"]["observation_path"] } # replay_batch_size = int(cpanel["replay_use_ratio"] * cpanel["batch_size"]) # train_batch_size = cpanel["batch_size"] - replay_batch_size # # params["agents"]["agent"]["sampler_list"] = ["dextron.agent.sac.multi_sampler.multi_memory_sample"] # params["agents"]["agent"]["sampler_args"] = {"agent_name": params["agents"]["agent"]["name"], # "batch_size_dict": {"train":train_batch_size, "replay":replay_batch_size}, # "observation_path": params["agents"]["agent"]["observation_path"] # } # # It deletes the last element from the chunk # params["agents"]["agent"]["sampler"]["truncate_datalists"] = {"n":1} # MUST be 1 to truncate last item: (T+1 --> T) ############# ### Model ### ############# agent_name = params["agents"]["agent"]["name"] observation_path = params["agents"]["agent"]["observation_path"] # params["agents"]["agent"]["policyname"] = "digideep.agent.sac.Policy" params["agents"]["agent"]["policyargs"] = {"obs_space": params["env"]["config"]["observation_space"][observation_path], "act_space": params["env"]["config"]["action_space"][agent_name], "image_repr_size": 64, "hidden_size": 256, "value_args": {"init_w":0.003}, "softq_args": {"init_w":0.003}, "actor_args": {"init_w":0.003, "log_std_min":-20, "log_std_max":2}, "average_args": {"mode":"soft", "polyak_factor":cpanel["polyak_factor"]}, # # {"mode":"hard", "interval":10000} } 
# lim = params["env"]["config"]["action_space"][agent_name]["lim"][1][0] # # params["agents"]["agent"]["noisename"] = "digideep.agent.noises.EGreedyNoise" # # params["agents"]["agent"]["noiseargs"] = {"std":cpanel["noise_std"], "e":0.3, "lim": lim} # params["agents"]["agent"]["noisename"] = "digideep.agent.noises.OrnsteinUhlenbeckNoise" # params["agents"]["agent"]["noiseargs"] = {"mu":0, "theta":0.15, "sigma":cpanel["noise_std"], "lim":lim} # # params["agents"]["agent"]["noiseargs"] = {"mu":0, "theta":0.15, "sigma":1} params["agents"]["agent"]["optimname_value"] = "torch.optim.Adam" params["agents"]["agent"]["optimargs_value"] = {"lr":cpanel["lr_value"]} # , "eps":cpanel["eps"] params["agents"]["agent"]["optimname_softq"] = "torch.optim.Adam" params["agents"]["agent"]["optimargs_softq"] = {"lr":cpanel["lr_softq"]} # , "eps":cpanel["eps"] params["agents"]["agent"]["optimname_actor"] = "torch.optim.Adam" params["agents"]["agent"]["optimargs_actor"] = {"lr":cpanel["lr_actor"]} # , "eps":cpanel["eps"] # # RMSprop optimizer alpha # # params["agents"]["agent"]["optimargs"] = {"lr":1e-2, "alpha":0.99, "eps":1e-5, "weight_decay":0, "momentum":0, "centered":False} ############################################## ############################################## ### Agent (#2) ### Demonstrator ################## params["agents"]["demonstrator"] = {} params["agents"]["demonstrator"]["name"] = "demonstrator" params["agents"]["demonstrator"]["type"] = "dextron.agent.demonstrator.NaiveController" params["agents"]["demonstrator"]["methodargs"] = {} agent_name = params["agents"]["demonstrator"]["name"] params["agents"]["demonstrator"]["methodargs"]["act_space"] = params["env"]["config"]["action_space"][agent_name] ############################################## ############################################## ### Memory ### ############## params["memory"] = {} # TODO: The memory size in chunks should be proportionately distributed. We think that "demo" should have a # smaller memory size. # "digideep.memory.generic.Memory" | "digideep.memory.ringbuffer.Memory" # chunk_sample_len: Number of samples in a chunk # buffer_chunk_len: Number of chunks in the buffer params["memory"]["train"] = {} params["memory"]["train"]["type"] = "digideep.memory.ringbuffer.Memory" params["memory"]["train"]["args"] = {"chunk_sample_len":cpanel["n_steps"], "buffer_chunk_len":cpanel["memory_size_in_chunks"], "overrun":1} params["memory"]["demo"] = {} params["memory"]["demo"]["type"] = "digideep.memory.ringbuffer.Memory" params["memory"]["demo"]["args"] = {"chunk_sample_len":cpanel["n_steps"], "buffer_chunk_len":cpanel["demo_memory_size_in_chunks"], "overrun":1} # params["memory"]["replay"] = {} # params["memory"]["replay"]["type"] = "digideep.memory.ringbuffer.Memory" # params["memory"]["replay"]["args"] = {"chunk_sample_len":cpanel["replay_nsteps"], "buffer_chunk_len":cpanel["memory_size_in_chunks"]} ############################################## ############################################## ### Explorer ### ################ params["explorer"] = {} params["explorer"]["train"] = {} params["explorer"]["train"]["mode"] = "train" params["explorer"]["train"]["env"] = params["env"] params["explorer"]["train"]["do_reset"] = False params["explorer"]["train"]["final_action"] = False params["explorer"]["train"]["warm_start"] = cpanel["warm_start"] # In less than "warm_start" steps the agent will take random actions. 
params["explorer"]["train"]["num_workers"] = cpanel["num_workers"] params["explorer"]["train"]["deterministic"] = False # MUST: Takes random actions params["explorer"]["train"]["n_steps"] = cpanel["n_steps"] # Number of steps to take a step in the environment params["explorer"]["train"]["n_episodes"] = None # Do not limit # of episodes params["explorer"]["train"]["win_size"] = 20 # Number of episodes to episode reward for report params["explorer"]["train"]["render"] = False params["explorer"]["train"]["render_delay"] = 0 params["explorer"]["train"]["seed"] = cpanel["seed"] + 90 params["explorer"]["train"]["extra_env_kwargs"] = {"mode":params["explorer"]["train"]["mode"], "allow_demos":False} params["explorer"]["test"] = {} params["explorer"]["test"]["mode"] = "test" params["explorer"]["test"]["env"] = params["env"] params["explorer"]["test"]["do_reset"] = True params["explorer"]["test"]["final_action"] = False params["explorer"]["test"]["warm_start"] = 0 params["explorer"]["test"]["num_workers"] = cpanel["num_workers"] # We can use the same amount of workers for testing! params["explorer"]["test"]["deterministic"] = True # MUST: Takes the best action params["explorer"]["test"]["n_steps"] = None # Do not limit # of steps params["explorer"]["test"]["n_episodes"] = cpanel["test_win_size"] params["explorer"]["test"]["win_size"] = cpanel["test_win_size"] # Extra episodes won't be counted params["explorer"]["test"]["render"] = False params["explorer"]["test"]["render_delay"] = 0 params["explorer"]["test"]["seed"] = cpanel["seed"] + 100 # We want to make the seed of test environments different from training. params["explorer"]["test"]["extra_env_kwargs"] = {"mode":params["explorer"]["test"]["mode"], "allow_demos":False} params["explorer"]["eval"] = {} params["explorer"]["eval"]["mode"] = "eval" params["explorer"]["eval"]["env"] = params["env"] params["explorer"]["eval"]["do_reset"] = False params["explorer"]["eval"]["final_action"] = False params["explorer"]["eval"]["warm_start"] = 0 params["explorer"]["eval"]["num_workers"] = 1 params["explorer"]["eval"]["deterministic"] = True # MUST: Takes the best action params["explorer"]["eval"]["n_steps"] = None # Do not limit # of steps params["explorer"]["eval"]["n_episodes"] = 1 params["explorer"]["eval"]["win_size"] = -1 params["explorer"]["eval"]["render"] = True params["explorer"]["eval"]["render_delay"] = 0 params["explorer"]["eval"]["seed"] = cpanel["seed"] + 101 # We want to make the seed of eval environment different from test/train. 
params["explorer"]["eval"]["extra_env_kwargs"] = {"mode":params["explorer"]["eval"]["mode"], "allow_demos":cpanel.get("allow_demos", False)} ############################################## params["explorer"]["demo"] = {} params["explorer"]["demo"]["mode"] = "demo" params["explorer"]["demo"]["env"] = params["env"] params["explorer"]["demo"]["do_reset"] = False params["explorer"]["demo"]["final_action"] = False params["explorer"]["demo"]["warm_start"] = 0 params["explorer"]["demo"]["num_workers"] = cpanel["num_workers"] params["explorer"]["demo"]["deterministic"] = False # MUST: Takes random actions params["explorer"]["demo"]["n_steps"] = cpanel["n_steps"] # Number of steps to take a step in the environment params["explorer"]["demo"]["n_episodes"] = None params["explorer"]["demo"]["win_size"] = -1 params["explorer"]["demo"]["render"] = False # True # False params["explorer"]["demo"]["render_delay"] = 0 params["explorer"]["demo"]["seed"] = cpanel["seed"] + 50 params["explorer"]["demo"]["extra_env_kwargs"] = {"mode":params["explorer"]["demo"]["mode"], "allow_demos":True} # params["explorer"]["replay"] = {} # params["explorer"]["replay"]["mode"] = "replay" # params["explorer"]["replay"]["env"] = params["env"] # params["explorer"]["replay"]["do_reset"] = False # params["explorer"]["replay"]["final_action"] = False # params["explorer"]["replay"]["warm_start"] = 0 # params["explorer"]["replay"]["num_workers"] = cpanel["num_workers"] # params["explorer"]["replay"]["deterministic"] = False # MUST: Takes random actions # params["explorer"]["replay"]["n_steps"] = cpanel["replay_nsteps"] # Number of steps to take a step in the environment # params["explorer"]["replay"]["n_episodes"] = None # params["explorer"]["replay"]["win_size"] = 10 # params["explorer"]["replay"]["render"] = False # False # params["explorer"]["replay"]["render_delay"] = 0 # params["explorer"]["replay"]["seed"] = cpanel["seed"] + 50 # params["explorer"]["replay"]["extra_env_kwargs"] = {"mode":params["explorer"]["replay"]["mode"], "allow_demos":False} return params