def save_config(self):
    """
    Save config into a yaml file in the root experiment directory
    """
    folder = Path(self.session_config.folder)
    folder.mkdir(exist_ok=True, parents=True)
    config = Config(learner_config=self.learner_config,
                    env_config=self.env_config,
                    session_config=self.session_config)
    config.dump_file(str(folder / 'config.yml'))
def __init__(self, env, learner_config, session_config):
    """
    Default sender configs are in BASE_SESSION_CONFIG['sender'].
    They contain communication-level information.
    Algorithm-specific experience generation parameters should live in learner_config.
    """
    super().__init__(env)
    # TODO: initialize config in a unified place
    self.session_config = Config(session_config).extend(BASE_SESSION_CONFIG)
    self.learner_config = Config(learner_config).extend(BASE_LEARNER_CONFIG)
    host = os.environ['SYMPH_COLLECTOR_FRONTEND_HOST']
    port = os.environ['SYMPH_COLLECTOR_FRONTEND_PORT']
    self.sender = ExpSender(
        host=host,
        port=port,
        flush_iteration=self.session_config.sender.flush_iteration,
    )
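# Illustrative sketch (not part of the wrapper itself): the sender address is
# read from SYMPH_COLLECTOR_FRONTEND_HOST / SYMPH_COLLECTOR_FRONTEND_PORT,
# which the orchestration layer that launches the experiment normally exports.
# For a quick local smoke test they can be set by hand; the host/port values
# below are hypothetical.
import os

os.environ['SYMPH_COLLECTOR_FRONTEND_HOST'] = 'localhost'  # hypothetical value
os.environ['SYMPH_COLLECTOR_FRONTEND_PORT'] = '7000'       # hypothetical value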
PPO_DEFAULT_LEARNER_CONFIG = Config({
    'model': {
        'convs': [],  # this can wait until TorchX
        'actor_fc_hidden_sizes': [300, 200],
        'critic_fc_hidden_sizes': [300, 200],
        'cnn_feature_dim': 256,
        'use_layernorm': False,
    },
    'algo': {
        # base configs
        # 'agent_class': 'PPOAgent',
        # 'learner_class': 'PPOLearner',
        # 'experience': 'ExpSenderWrapperMultiStepMovingWindowWithInfo',
        'use_z_filter': True,
        'use_r_filter': False,
        'gamma': .995,
        'n_step': 25,  # 10 for without RNN
        'stride': 20,  # 10 for without RNN
        'network': {
            'lr_actor': 1e-4,
            'lr_critic': 1e-4,
            'clip_actor_gradient': True,
            'actor_gradient_norm_clip': 5.,
            'clip_critic_gradient': True,
            'critic_gradient_norm_clip': 5.,
            'actor_regularization': 0.0,
            'critic_regularization': 0.0,
            'anneal': {
                'lr_scheduler': "LinearWithMinLR",
                'frames_to_anneal': 5e6,
                'lr_update_frequency': 100,
                'min_lr': 5e-5,
            },
        },
        # PPO-specific parameters:
        'ppo_mode': 'adapt',
        'advantage': {
            'norm_adv': True,
            'lam': 0.97,
            'reward_scale': 1.0,
        },
        'rnn': {
            'if_rnn_policy': True,
            'rnn_hidden': 100,
            'rnn_layer': 1,
            'horizon': 5,
        },
        'consts': {
            'init_log_sig': -1.0,
            'log_sig_range': 0.25,
            'epoch_policy': 10,
            'epoch_baseline': 10,
            'adjust_threshold': (0.5, 2.0),  # threshold to magnify clip epsilon
            'kl_target': 0.015,  # target KL divergence between before and after
        },
        'adapt_consts': {
            'kl_cutoff_coeff': 250,  # penalty coeff when KL is large
            'beta_init': 1.0,  # initial beta
            'beta_range': (1 / 35.0, 35.0),  # range of the adapted penalty factor
            'scale_constant': 1.5,
        },
        'clip_consts': {
            'clip_epsilon_init': 0.2,  # initial factor of the clipped loss
            'clip_range': (0.05, 0.3),  # range of the adapted clip epsilon
            'scale_constant': 1.2,
        },
    },
    'replay': {
        # 'replay_class': 'FIFOReplay',
        'batch_size': 64,
        'memory_size': 96,
        'sampling_start_size': 64,
        'replay_shards': 1,
    },
    'parameter_publish': {
        'exp_interval': 4096,
    },
})
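# Illustrative sketch (not part of these defaults): a user-supplied learner
# config can be merged with PPO_DEFAULT_LEARNER_CONFIG via the same
# Config(...).extend(...) pattern used in the sender wrapper above, assuming
# extend() fills in every key the user config leaves unspecified. The override
# values below are examples only.
my_ppo_config = Config({
    'algo': {
        'ppo_mode': 'clip',  # clipped surrogate instead of adaptive KL penalty
        'rnn': {'if_rnn_policy': False},
        'n_step': 10,        # per the comments above: 10 when not using an RNN
        'stride': 10,
    },
})
ppo_learner_config = my_ppo_config.extend(PPO_DEFAULT_LEARNER_CONFIG)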
DDPG_DEFAULT_LEARNER_CONFIG = Config({
    'model': {
        'convs': [],
        'actor_fc_hidden_sizes': [300, 200],
        'critic_fc_hidden_sizes': [400, 300],
        'use_layernorm': False,
        'conv_spec': {
            # First conv layer: 16 output channels, second layer: 32 channels
            'out_channels': [16, 32],
            # First conv layer: kernel size 8, second layer: kernel size 4
            'kernel_sizes': [8, 4],
            # First conv layer: stride 4, second layer: stride 2
            'strides': [4, 2],
            # After the final convolution, the output is flattened and fed
            # through an MLP with this output size
            'hidden_output_dim': 200,
        },
    },
    'algo': {
        'gamma': .99,
        # Unroll the Bellman update over n steps
        'n_step': 3,
        # Send experiences every `stride` steps
        'stride': 1,
        'network': {
            'lr_actor': 1e-4,
            'lr_critic': 1e-3,
            'clip_actor_gradient': True,
            'actor_gradient_value_clip': 1.,
            'clip_critic_gradient': False,
            'critic_gradient_value_clip': 5.,
            # Weight regularization
            'actor_regularization': 0.0,
            'critic_regularization': 0.0,
            # Beta version: see https://arxiv.org/pdf/1802.09477.pdf and
            # https://github.com/sfujim/TD3/blob/master/TD3.py
            # for action regularization and double-critic algorithm details
            'use_action_regularization': False,
            'use_double_critic': False,
            'target_update': {
                # Soft: after every iteration,
                # target_params = (1 - tau) * target_params + tau * params
                # 'type': 'soft',
                # 'tau': 1e-3,
                # Hard: after `interval` iterations, target_params = params
                'type': 'hard',
                'interval': 500,
            },
        },
        'exploration': {
            # Beta implementation of parameter noise; see
            # https://blog.openai.com/better-exploration-with-parameter-noise/
            # for algorithm details
            'param_noise_type': None,
            # 'normal' parameter noise applies Gaussian noise to the agent's parameters
            # 'param_noise_type': 'normal',
            # 'adaptive_normal' parameter noise scales the noise sigma up or down
            # in order to achieve the target action standard deviation
            # 'param_noise_type': 'adaptive_normal',
            'param_noise_sigma': 0.05,
            'param_noise_alpha': 1.15,
            'param_noise_target_stddev': 0.005,
            # Vanilla noise: applies Gaussian noise to every action
            'noise_type': 'normal',
            'max_sigma': 1.0,
            # Or use Ornstein-Uhlenbeck noise instead of Gaussian noise
            # 'noise_type': 'ou_noise',
            'theta': 0.15,
            'dt': 1e-3,
        },
    },
    'replay': {
        'batch_size': 512,
        # The total replay size is memory_size * replay_shards
        'memory_size': int(1000000 / 3),
        'sampling_start_size': 3000,
        'replay_shards': 3,
    },
    'parameter_publish': {
        # Minimum amount of time (seconds) between two parameter publishes
        'min_publish_interval': 3,
    },
})
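# Illustrative sketch: enabling the commented-out alternatives above (soft
# target updates and Ornstein-Uhlenbeck exploration noise) by copying the
# defaults and mutating them with attribute-style access, mirroring the
# Config(...to_dict()) copy pattern used in the example scripts below.
# Nested attribute assignment behaving exactly like this is an assumption
# about the Config class.
ddpg_config = Config(DDPG_DEFAULT_LEARNER_CONFIG.to_dict())  # copy, keep defaults intact
ddpg_config.algo.network.target_update = {'type': 'soft', 'tau': 1e-3}
ddpg_config.algo.exploration.noise_type = 'ou_noise'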
import copy

import numpy as np

from surreal.env.mujocomanip.default_env_configs import *
from surreal.env.mujocomanip.default_object_configs import *
from surreal.env.mujocomanip.mujocomanip_envs import *
from surreal.session import Config

env_config = DEFAULT_STACKER_CONFIG
env_config.display = True

object_configs = []
for x in range(5):
    # one_config = DEFAULT_RANDOM_BOX_CONFIG  # This doesn't work: every entry would alias the same config object
    one_config = Config(DEFAULT_RANDOM_BOX_CONFIG.to_dict())
    one_config.seed = np.random.randint(100000)
    object_configs.append(one_config)
env_config.mujoco_objects_spec = object_configs
env_config.obs_spec.dim = 9 * len(object_configs) + 28

env = SurrealSawyerStackEnv(env_config)
obs, info = env.reset()
while True:
    obs, info = env.reset()
    ### TODO: we should implement
    ### TODO: this might need clipping
    ### action = np.random.randn(8)
    # action[7] *= 0.020833
    for i in range(2000):
        action = np.random.randn(8) / 2
        action[7] = -1
import imageio
import numpy as np

from surreal.env import make_env
from surreal.session import Config

env_config = Config({
    'env_name': 'mujocomanip:BaxterLiftEnv',
    'pixel_input': True,
    'frame_stacks': 3,
    'sleep_time': 0.0,
    # 'limit_episode_length': 200,  # 0 means no limit
    'limit_episode_length': 1000,  # 0 means no limit
    'video': {
        'record_video': True,
        'save_folder': None,
        'max_videos': 500,
        'record_every': 100,
    },
    'observation': {
        'pixel': ['camera0', 'depth'],
        # if using ObservationConcatWrapper, low_dim inputs will be concatenated into 'flat_inputs'
        # 'low_dim': ['position', 'velocity', 'proprio', 'cube_pos', 'cube_quat', 'gripper_to_cube'],
        'low_dim': ['position', 'velocity', 'proprio'],
    },
})

writer = imageio.get_writer('baxter_lift.mp4', fps=20)
env, env_config = make_env(env_config)
obs = env.reset()
for i in range(1000):