def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to GPU if requested and available
    if self.args.cuda and torch.cuda.is_available():
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory for storing the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        print('model path', self.model_path)
    # tensorboardX writer (disabled)
    # self.writer = SummaryWriter('./logs')
    self.reward_list = []
    self.reward_record = []
    self.success_rate_list = []
    self.success_list = []
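# Hedged sketch of what `sync_networks` typically does in MPI-based HER code:
# broadcast rank 0's parameters so every worker starts from identical weights.
# This illustrates the idea only; it is not this repo's actual implementation,
# and `sync_networks_sketch` is a hypothetical name.
import numpy as np
import torch
from mpi4py import MPI

def sync_networks_sketch(network):
    # flatten all parameters into one contiguous buffer
    flat = np.concatenate([p.data.cpu().numpy().flatten() for p in network.parameters()])
    # every rank receives rank 0's copy
    MPI.COMM_WORLD.Bcast(flat, root=0)
    # unflatten back into the network's parameters
    offset = 0
    for p in network.parameters():
        n = p.numel()
        p.data.copy_(torch.as_tensor(flat[offset:offset + n]).view_as(p.data))
        offset += n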
def __init__(self, observation_space, action_space, discount=0.99, td_lambda=0.95,
             hidden_size=(128, 64), temp=1., max_weight=20, action_std=0.4,
             actor_lr=0.0001, critic_lr=0.01, device='cpu', batch_size=256,
             pipe=None, optimizer='SGD', activation='relu'):
    self.device = device
    inp_dim = observation_space.shape[0]
    self.actor = actor(inp_dim, action_space.low.shape[0], std=action_std,
                       hidden_size=hidden_size, activation=activation).to(device)
    self.critic = critic(inp_dim, hidden_size=hidden_size, activation=activation).to(device)
    self.normalizer = Normalizer((inp_dim,), default_clip_range=5).to(device)
    self.normalizer.count += 1  # unbiased ...
    self.temp = temp
    self.max_weight = max_weight
    # NOTE: the optimizer type is configurable
    if optimizer == 'SGD':
        self.optim_actor = torch.optim.SGD(self.actor.parameters(), actor_lr, momentum=0.9)
        self.optim_critic = torch.optim.SGD(self.critic.parameters(), critic_lr, momentum=0.9)
    else:
        self.optim_actor = torch.optim.Adam(self.actor.parameters(), actor_lr)
        self.optim_critic = torch.optim.Adam(self.critic.parameters(), critic_lr)
    self.pipe = pipe
    self.batch_size = batch_size
    self.mse = nn.MSELoss()
    self.discount = discount
    self.td_lambda = td_lambda
    self.val_norm = 1.0 / (1.0 - self.discount)
    # midpoint and half-range of the action box, used to rescale actions
    self.action_mean = ((action_space.high + action_space.low) / 2)[None, :]
    self.action_std = ((action_space.high - action_space.low) / 2)[None, :]
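# A minimal sketch (a hypothetical helper, not part of the original class) of
# how the `action_mean`/`action_std` midpoint and half-range computed above are
# typically used to map a squashed [-1, 1] actor output onto the env's bounds.
def rescale_action(self, raw_action):
    # raw_action: (batch, act_dim) in [-1, 1]; the result lies in [low, high]
    return self.action_mean + self.action_std * raw_action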
def __init__(self, obs_dim, act_dim, env, memory_size=50000, batch_size=64,
             lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, n_steps=1):
    self.gamma = gamma
    self.batch_size = batch_size
    self.obs_dim = obs_dim
    self.act_dim = act_dim
    self.memory_size = memory_size
    self.tau = tau
    self.env = env
    self.n_steps = n_steps
    self.n_step_gamma = self.gamma ** self.n_steps
    # actor
    self.actor = actor(input_size=obs_dim, output_size=act_dim)
    self.actor_target = actor(input_size=obs_dim, output_size=act_dim)
    self.actor_target.load_state_dict(self.actor.state_dict())
    # critic
    self.critic = critic(state_size=obs_dim, action_size=act_dim, output_size=1)
    self.critic_target = critic(state_size=obs_dim, action_size=act_dim, output_size=1)
    self.critic_target.load_state_dict(self.critic.state_dict())
    # optimizers
    self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
    # critic loss
    self.critic_loss = nn.MSELoss()
    # exploration noise
    # self.noise = OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
    self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)
    # replay buffer
    # self.replayBuffer = Replay(self.memory_size, window_length=1)
    self.replayBuffer = Replay(self.memory_size, self.env)
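# Hedged sketch of the Polyak soft update that the stored `tau` is normally
# used for in DDPG; `soft_update` is a hypothetical helper, not shown above.
def soft_update(target, source, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)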
def __init__(self, args, env, env_params, image=True):
    self.args = args
    self.env = env
    self.env_params = env_params
    self.image = image
    # create the networks
    if self.image:
        self.actor_network = actor_image(env_params, env_params['obs'])
        self.critic_network = critic_image(env_params, env_params['obs'] + env_params['action'])
    else:
        self.actor_network = actor(env_params, env_params['obs'])
        self.critic_network = critic(env_params, env_params['obs'] + env_params['action'])
    # load a model if a load directory is given
    if self.args.load_dir != '':
        actor_load_path = self.args.load_dir + '/actor.pt'
        model = torch.load(actor_load_path)
        self.actor_network.load_state_dict(model)
        critic_load_path = self.args.load_dir + '/critic.pt'
        model = torch.load(critic_load_path)
        self.critic_network.load_state_dict(model)
    # sync the networks across the CPUs
    # sync_networks(self.actor_network)
    # sync_networks(self.critic_network)
    # build up the target network
    # if self.image:
    #     self.actor_target_network = actor_image(env_params, env_params['obs'])
    # else:
    #     self.actor_target_network = actor(env_params, env_params['obs'])
    # # load the weights into the target networks
    # self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    # move to GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        # self.actor_target_network.cuda()
        self.critic_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler (`env` appears to be a factory here, hence the call)
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env().compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions, image=self.image)
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
def __init__(self, env, args):
    self.env = env
    self.args = args
    # get the dims and action bound of the environment
    obs_dims = self.env.observation_space.shape[0]
    self.action_dims = self.env.action_space.shape[0]
    self.action_max = self.env.action_space.high[0]
    # define the networks
    self.actor_net = actor(obs_dims, self.action_dims)
    self.critic_net = critic(obs_dims, self.action_dims)
    # sync the weights across the MPI workers
    sync_networks(self.actor_net)
    sync_networks(self.critic_net)
    # build the target networks
    self.actor_target_net = copy.deepcopy(self.actor_net)
    self.critic_target_net = copy.deepcopy(self.critic_net)
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_net.parameters(), self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_net.parameters(), self.args.lr_critic,
                                         weight_decay=self.args.critic_l2_reg)
    # create the replay buffer
    self.replay_buffer = replay_buffer(self.args.replay_size)
    # create the normalizer
    self.o_norm = normalizer(obs_dims, default_clip_range=self.args.clip_range)
    # create the noise generator
    self.noise_generator = ounoise(std=0.2, action_dim=self.action_dims)
    # create the directory to save models (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    # create an evaluation environment with its own seed per MPI rank
    self.eval_env = gym.make(self.args.env_name)
    self.eval_env.seed(self.args.seed * 2 + MPI.COMM_WORLD.Get_rank())
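# Hedged usage sketch: combining the actor, the noise generator, and
# `action_max` at action-selection time. `select_action` is a hypothetical
# helper, the `.sample()` call assumes a conventional noise-process API, and
# the clip assumes a symmetric action box.
def select_action(self, obs):
    with torch.no_grad():
        action = self.actor_net(torch.as_tensor(obs, dtype=torch.float32)).numpy()
    action = action + self.noise_generator.sample()  # assumed ounoise interface
    return np.clip(action, -self.action_max, self.action_max)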
    'n_classes': params.n_classes,
    'pool_type': params.pool_type,
    'nonlinear_fc': params.nonlinear_fc,
    'encoder_type': params.encoder_type,
    'use_cuda': True,
}

# model
encoder_types = [
    'InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
    'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
    'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'
]
assert params.encoder_type in encoder_types, \
    "encoder_type must be in " + str(encoder_types)
nli_net = critic(config_nli_model)
actorModel = actor(params.enc_lstm_dim, params.word_emb_dim)
print(nli_net)
print(actorModel)
for name, x in nli_net.named_parameters():
    print(name)
for name, x in actorModel.named_parameters():
    print(name)
# print(nli_net.target_pred.enc_lstm.weight_ih_l0)
# print(nli_net.target_classifier[4].bias)

# loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    # sync_networks(self.actor_network)
    # sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load a saved model if required
    if args.load_path is not None:
        o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        self.actor_network.load_state_dict(load_actor_model)
        self.critic_network.load_state_dict(load_critic_model)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory for storing the model
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # make up a suffix for the model path to indicate which method was used for training
    # self.folder_suffix = '_' + self.args.replay_strategy + '_' + self.args.env_params.reward_type
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.model_path = os.path.join(self.model_path, 'seed_' + str(self.args.seed))
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
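# Hedged sketch of the save counterpart to the six-element tuple unpacked from
# `torch.load(args.load_path)` above; the field order mirrors that unpacking,
# but `save_checkpoint` itself is an assumption, not the original author's code.
def save_checkpoint(self, path):
    torch.save([self.o_norm.mean, self.o_norm.std,
                self.g_norm.mean, self.g_norm.std,
                self.actor_network.state_dict(),
                self.critic_network.state_dict()], path)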
def __init__(self, args, env, env_params):
    self.savetime = 0
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # whether to add demonstration data
    if self.args.add_demo:
        self._init_demo_buffer()  # initialize the replay buffer with demonstrations
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # load saved data to continue training (disabled)
    # model_path = "saved_models/bmirobot-v3/125_True12_model.pt"
    # model_path = args.save_dir + args.env_name + '/' + str(args.seed) + '_' + str(args.add_demo) + '_model.pt'
    # o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
    # self.actor_network.load_state_dict(model)
    # self.o_norm.mean = o_mean
    # self.o_norm.std = o_std
    # self.g_norm.mean = g_mean
    # self.g_norm.std = g_std
    self.success_rates = []  # record the success rate of each epoch
    # create the directory for storing the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    'n_classes': params.n_classes,
    'pool_type': params.pool_type,
    'nonlinear_fc': params.nonlinear_fc,
    'encoder_type': params.encoder_type,
    'use_cuda': True,
}

# model
encoder_types = [
    'InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
    'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
    'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'
]
assert params.encoder_type in encoder_types, \
    "encoder_type must be in " + str(encoder_types)
criticModel = critic(config_nli_model, transformer_opt)
actorModel = actor(params.enc_lstm_dim, params.word_emb_dim)
print(criticModel)
print(actorModel)
# nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.outputmodelname)))

# loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight)
loss_fn.size_average = False

# optimizer
optim_fn1, optim_params1 = get_optimizer(params.optimizer)
critic_target_optimizer = optim_fn1(
    list(criticModel.target_pred.parameters()) +
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load a saved model if required
    if args.load_path is not None:
        o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        self.actor_network.load_state_dict(load_actor_model)
        self.critic_network.load_state_dict(load_critic_model)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    if self.args.replay_strategy == 'future':
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
    else:
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_normal_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory for storing the model
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # build a suffix for the model path that indicates which method was used for training
    buffer_len_epochs = int(self.args.buffer_size /
                            (env_params['max_timesteps'] * self.args.num_rollouts_per_cycle * self.args.n_cycles))
    name_add_on = ''
    if self.args.exploration_strategy == 'pgg':
        if self.args.pgg_strategy == 'final':
            if self.args.replay_strategy == 'future':
                name_add_on = '_final_distance_based_goal_generation_buffer' + str(buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_final_distance_based_goal_generation_withoutHER_buffer' + str(buffer_len_epochs) + 'epochs'
        else:
            if self.args.replay_strategy == 'future':
                name_add_on = '_distance_based_goal_generation_buffer' + str(buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_distance_based_goal_generation_withoutHER_buffer' + str(buffer_len_epochs) + 'epochs'
    else:
        if self.args.replay_strategy == 'future':
            name_add_on = '_originalHER_buffer' + str(buffer_len_epochs) + 'epochs'
        else:
            name_add_on = '_originalDDPG_buffer' + str(buffer_len_epochs) + 'epochs'
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name + name_add_on)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.model_path = os.path.join(self.model_path, 'seed_' + str(self.args.seed))
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
def __init__(self, args, envs_lst, env_params, expert_lst_dir,
             recurrent=True, ee_reward=True, image=True):
    self.args = args
    self.envs_lst = envs_lst
    self.env_params = env_params
    self.recurrent = recurrent
    self.ee_reward = ee_reward
    self.image = image
    # initialize the experts
    self.expert_lst = []
    for expert_dir in expert_lst_dir:
        expert_load_path = expert_dir + '/model.pt'
        o_mean, o_std, g_mean, g_std, model = torch.load(expert_load_path)
        expert_model = actor(env_params, env_params['obs'] + env_params['goal'])
        expert_model.load_state_dict(model)
        self.expert_lst.append({
            "model": expert_model,
            "o_mean": o_mean,
            "o_std": o_std,
            "g_mean": g_mean,
            "g_std": g_std
        })
    # create the networks (the critic is shared by both branches)
    if self.recurrent:
        self.actor_network = actor_recurrent(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
        # self.critic_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
    else:
        self.actor_network = actor(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
    self.critic_network = critic(
        env_params,
        env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.sg_norm = normalizer(size=env_params['action'], default_clip_range=self.args.clip_range)
    # load a model if a load directory is given
    if self.args.load_dir != '':
        load_path = self.args.load_dir + '/model.pt'
        # o_mean, o_std, g_mean, g_std, sg_mean, sg_std, model = torch.load(load_path)
        o_mean, o_std, g_mean, g_std, model = torch.load(load_path)
        self.o_norm.mean = o_mean
        self.o_norm.std = o_std
        self.g_norm.mean = g_mean
        self.g_norm.std = g_std
        # self.sg_norm.mean = sg_mean
        # self.sg_norm.std = sg_std
        self.actor_network.load_state_dict(model)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    if self.recurrent:
        self.actor_target_network = actor_recurrent(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
        # self.critic_target_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
    else:
        self.actor_target_network = actor(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
    self.critic_target_network = critic(
        env_params,
        env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER samplers, one per environment
    self.her_module_lst = [
        her_sampler(self.args.replay_strategy, self.args.replay_k, env.compute_reward)
        for env in self.envs_lst
    ]
    # create the replay buffers, one per HER sampler
    self.buffer_lst = [
        replay_buffer(self.env_params, self.args.buffer_size,
                      her_module.sample_her_transitions, ee_reward=True)
        for her_module in self.her_module_lst
    ]
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
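# Hedged sketch of how the o/g/sg normalizers above are usually applied before
# a forward pass; `_preproc_inputs` is a hypothetical helper, and the
# `normalizer.normalize` call assumes the conventional HER normalizer API.
def _preproc_inputs(self, obs, g, sg):
    obs_norm = self.o_norm.normalize(obs)
    g_norm = self.g_norm.normalize(g)
    sg_norm = self.sg_norm.normalize(sg)
    # concatenate to match the actor's obs + goal + action input width
    inputs = np.concatenate([obs_norm, g_norm, sg_norm])
    return torch.as_tensor(inputs, dtype=torch.float32).unsqueeze(0)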
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # replace the environment passed in with a PointModel instance
    muscle_labels = ["m" + str(i) for i in range(args.num_muscles)]
    env = PointModel(
        verbose=0,
        success_thres=args.success_threshold,
        dof_observation=args.dob,
        include_follow=False,
        port=args.port,
        muscle_labels=muscle_labels,
    )
    self.env = env
    # create the directory for storing the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name, self.args.exp_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
def __init__(self, obs_dim, act_dim, env=None, memory_size=50000, batch_size=64,
             lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, prioritized_replay=True,
             critic_dist_info=None, n_steps=1):
    self.gamma = gamma
    self.n_steps = n_steps
    self.n_step_gamma = self.gamma ** self.n_steps
    self.batch_size = batch_size
    self.obs_dim = obs_dim
    self.act_dim = act_dim
    self.memory_size = memory_size
    self.tau = tau
    self.env = env
    # critic_dist_info: a dict describing the critic's output distribution.
    # 'type' selects 'categorical' or 'mixture_of_gaussian'; the categorical
    # case also expects 'v_min', 'v_max', and 'n_atoms'.
    self.dist_type = critic_dist_info['type']
    if critic_dist_info['type'] == 'categorical':
        self.v_min = critic_dist_info['v_min']
        self.v_max = critic_dist_info['v_max']
        self.n_atoms = critic_dist_info['n_atoms']
        self.delta = (self.v_max - self.v_min) / float(self.n_atoms - 1)
        self.bin_centers = np.array(
            [self.v_min + i * self.delta for i in range(self.n_atoms)]).reshape(-1, 1)
    elif critic_dist_info['type'] == 'mixture_of_gaussian':
        # TODO: mixture-of-Gaussian critic is not implemented yet
        pass
    else:
        raise ValueError("Unsupported critic distribution type: " + str(critic_dist_info['type']))
    # actor
    self.actor = actor(input_size=obs_dim, output_size=act_dim)
    self.actor_target = actor(input_size=obs_dim, output_size=act_dim)
    self.actor_target.load_state_dict(self.actor.state_dict())
    # critic
    self.critic = critic(state_size=obs_dim, action_size=act_dim, dist_info=critic_dist_info)
    self.critic_target = critic(state_size=obs_dim, action_size=act_dim, dist_info=critic_dist_info)
    self.critic_target.load_state_dict(self.critic.state_dict())
    # optimizers
    self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
    # critic loss
    self.critic_loss = nn.CrossEntropyLoss()
    # exploration noise
    # self.noise = OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
    self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)
    # replay buffer
    self.prioritized_replay = prioritized_replay
    if self.prioritized_replay:
        # OpenAI baselines prioritized replay memory
        self.replayBuffer = PrioritizedReplayBuffer(self.memory_size, alpha=0.6)
        prioritized_replay_beta0 = 0.4
        prioritized_replay_beta_iters = 100000
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
        self.prioritized_replay_eps = 1e-6
    else:
        # self-implemented memory buffer
        self.replayBuffer = Replay(self.memory_size, self.env,
                                   n_steps=self.n_steps, gamma=self.gamma)
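# Hedged sketch: with a categorical critic, a scalar Q-value is recovered as
# the expectation of the atom probabilities over the `bin_centers` built in
# `__init__`. `expected_q` is a hypothetical helper, and `probs` is assumed to
# be a (batch, n_atoms) numpy array of atom probabilities.
def expected_q(self, probs):
    # (batch, n_atoms) @ (n_atoms, 1) -> (batch, 1) expected return
    return probs @ self.bin_centers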