def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffers
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    self.buffer2 = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
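# --- Hedged sketch (not part of the original snippet): a DDPG/HER agent like the one
# --- above typically refreshes its target networks with a Polyak soft update. The
# --- function name and the polyak coefficient below are illustrative assumptions.
def _soft_update(target_net, source_net, polyak=0.95):
    # target <- polyak * target + (1 - polyak) * source, applied parameter-by-parameter
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(polyak * target_param.data + (1.0 - polyak) * param.data)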
def __init__(self, env, capacity, update_freq, episode, feature_dim, k_dim, dilation,
             horizon_c, learning_rate, alpha, gamma, entropy_weight, render):
    # * feature_dim >> k_dim
    # * dilation == horizon_c
    # * capacity <= update_freq
    self.env = env
    self.capacity = capacity
    self.update_freq = update_freq
    self.episode = episode
    self.feature_dim = feature_dim
    self.k_dim = k_dim
    self.dilation = dilation
    self.horizon_c = horizon_c
    self.learning_rate = learning_rate
    self.alpha = alpha
    self.gamma = gamma
    self.entropy_weight = entropy_weight
    self.render = render
    self.observation_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.n
    self.net = feudal_networks(self.observation_dim, self.feature_dim, self.k_dim,
                               self.action_dim, self.dilation, self.horizon_c)
    self.buffer = replay_buffer(self.capacity)
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
    self.h_m = torch.zeros([1, self.feature_dim])
    self.c_m = torch.zeros([1, self.feature_dim])
    self.h_w = torch.zeros([1, self.action_dim * self.k_dim])
    self.c_w = torch.zeros([1, self.action_dim * self.k_dim])
    self.count = 0
    self.weight_reward = None
def __init__(self, global_net, optimizer, global_episode_counter, global_reward, res_queue,
             name, max_episode, gamma, env_id, capacity, train_freq, n_step, stack_num,
             pc_weight, rp_weight, vr_weight, batch_size, observation_dim, entropy_weight):
    super(worker, self).__init__()
    self.name = 'w' + name
    self.global_episode_counter = global_episode_counter
    self.global_reward = global_reward
    self.res_queue = res_queue
    self.global_net = global_net
    self.optimizer = optimizer
    self.max_episode = max_episode
    self.gamma = gamma
    self.env_id = env_id
    self.env = gym.make(env_id)
    self.env = self.env.unwrapped
    self.action_dim = self.env.action_space.n
    self.observation_dim = observation_dim
    self.entropy_weight = entropy_weight
    self.local_net = unreal(self.observation_dim, self.action_dim, self.gamma, self.entropy_weight)
    self.capacity = capacity
    self.train_freq = train_freq
    self.n_step = n_step
    self.stack_num = stack_num
    self.pc_weight = pc_weight
    self.rp_weight = rp_weight
    self.vr_weight = vr_weight
    self.batch_size = batch_size
    self.buffer = replay_buffer(self.capacity, self.train_freq, self.n_step, self.stack_num, self.gamma)
    self.last_action = torch.zeros(1, self.action_dim)
    self.last_reward = torch.zeros(1, 1)
def __init__(self, alpha=0.001, beta=0.002, inp_dims=[8], env=None, gamma=0.99, n_actions=2,
             max_size=1000000, tau=0.005, fcl1=512, fcl2=512, batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.mem = replay_buffer(max_size, inp_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.max_action = env.action_space.high[0]
    self.min_action = env.action_space.low[0]
    self.actor = actor(n_actions=n_actions, name="Actor")
    self.critic = critic(n_actions=n_actions, name="Critic")
    self.actor_target = actor(n_actions=n_actions, name="Target_actor")
    self.critic_target = critic(n_actions=n_actions, name="Target_critic")
    # the learning rates are passed only so that compile() succeeds
    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic.compile(optimizer=Adam(learning_rate=beta))
    self.actor_target.compile(optimizer=Adam(learning_rate=alpha))
    self.critic_target.compile(optimizer=Adam(learning_rate=beta))
    self.noise = ou_action_noise(mu=np.zeros(n_actions))
    self.update_net_params(tau=1)
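# --- Hedged sketch (not in the original): what an update_net_params-style Polyak update
# --- commonly looks like for Keras models. get_weights/set_weights are standard Keras
# --- methods, but this helper is an illustrative assumption, not the author's code.
def soft_update(target_model, source_model, tau):
    # new_target = tau * source + (1 - tau) * target, applied weight tensor by weight tensor
    new_weights = [tau * w + (1.0 - tau) * tw
                   for w, tw in zip(source_model.get_weights(), target_model.get_weights())]
    target_model.set_weights(new_weights)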
def main():
    initial_exploration = 2000  # start training only after 2000 transitions have been stored
    print_interval = 100        # how often (in episodes) to print a log
    maximum_steps = 300         # the task is infinite-horizon, so each episode is cut off at some point
    episode = 3000              # number of episodes
    action_space = 2
    state_space = 4
    env = gym.make('CartPole-v1')
    multi_step_size = 10        # 1 = 1-step TD (TD(0)), 2 = 2-step TD, 3 = 3-step TD, ...
    DDQN = Double_DQN.double_dqn(state_space, action_space, multi_step_size)
    buffer_size = 100000        # replay buffer size
    batch_size = 32             # batch size
    replay_buffer = buf.replay_buffer(buffer_size, multi_step_size, batch_size)
    step = 0                    # counts the total number of environment steps
    score = 0
    show_score = []
    for epi in range(episode):
        obs = env.reset()  # reset the environment
        for i in range(maximum_steps):
            action = DDQN.action_policy(torch.Tensor(obs))
            # `_` holds the info dict, which is not needed for training
            next_obs, reward, done, _ = env.step(action)
            mask = 0 if done else 1  # survival flag: 0 once the episode has terminated
            replay_buffer.store((obs, action, reward, next_obs, mask))  # store the transition in the replay buffer
            obs = next_obs           # the next state becomes the current state
            score += reward          # accumulate the reward
            step += 1
            if step > initial_exploration:  # start training once the initial exploration phase is over
                # random mini-batch sampling
                random_mini_batch, random_mini_batch_next, index, buffer = replay_buffer.make_batch()
                DDQN.train(random_mini_batch, random_mini_batch_next, index, buffer)
            if done:  # the episode ended, so break out and reset the environment
                break
        if epi % print_interval == 0 and epi != 0:
            show_score.append(score / print_interval)  # record the average reward score
            print('episode: ', epi, ' step: ', step, ' epsilon: ', DDQN.print_eps(),
                  ' score: ', score / print_interval)  # print the log
            score = 0
    with open('10step_ddqn_dueling.p', 'wb') as file:
        pickle.dump(show_score, file)
    env.close()
def __init__(self, env, episode, capacity, gamma, lam, is_disc, value_learning_rate,
             policy_learning_rate, discriminator_learning_rate, batch_size, file,
             policy_iter, disc_iter, value_iter, epsilon, entropy_weight, train_iter,
             clip_grad, render):
    self.env = env
    self.episode = episode
    self.capacity = capacity
    self.gamma = gamma
    self.lam = lam
    self.is_disc = is_disc
    self.value_learning_rate = value_learning_rate
    self.policy_learning_rate = policy_learning_rate
    self.discriminator_learning_rate = discriminator_learning_rate
    self.batch_size = batch_size
    self.file = file
    self.policy_iter = policy_iter
    self.disc_iter = disc_iter
    self.value_iter = value_iter
    self.epsilon = epsilon
    self.entropy_weight = entropy_weight
    self.train_iter = train_iter
    self.clip_grad = clip_grad
    self.render = render
    self.observation_dim = self.env.observation_space.shape[0]
    if is_disc:
        self.action_dim = self.env.action_space.n
    else:
        self.action_dim = self.env.action_space.shape[0]
    if is_disc:
        self.policy_net = disc_policy_net(self.observation_dim, self.action_dim)
    else:
        self.policy_net = cont_policy_net(self.observation_dim, self.action_dim)
    self.value_net = value_net(self.observation_dim, 1)
    self.discriminator = discriminator(self.observation_dim + self.action_dim)
    self.buffer = replay_buffer(self.capacity, self.gamma, self.lam)
    self.pool = pickle.load(self.file)
    self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.policy_learning_rate)
    self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.value_learning_rate)
    self.discriminator_optimizer = torch.optim.Adam(self.discriminator.parameters(),
                                                    lr=self.discriminator_learning_rate)
    self.disc_loss_func = nn.BCELoss()
    self.weight_reward = None
    self.weight_custom_reward = None
def __init__(self, envs, testing_envs, seed, variance_limit=0.25):
    self.seed = seed
    self.successes = []
    self.testing_envs = testing_envs
    self.envs = envs
    self.variance_limit = variance_limit
    training_envs_per_dof = int(len(self.envs.envs) / 3)
    self.training_env_seq = [4] * training_envs_per_dof + [5] * training_envs_per_dof + [6] * training_envs_per_dof
    self.testing_env_seq = [4] * 10 + [5] * 10 + [6] * 10
    if p.mode == "retrain":
        self.training_env_seq = self.testing_env_seq
    self.device = torch.device(p.device)
    # create the networks
    self.actor = Actor().to(self.device)
    self.critic = Critic().to(self.device)
    if p.mode == 'retrain':
        self.actor.load_state_dict(torch.load("actor_seed_{}".format(seed)))
        self.critic.load_state_dict(torch.load("critic_seed_{}".format(seed)))
    # build up the target networks
    self.actor_target = Actor().to(self.device)
    self.critic_target = Critic().to(self.device)
    # load the weights into the target networks
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
    # create the replay buffer
    self.buffer = replay_buffer(seed)
    if p.mode == 'retrain':
        self.buffer.load_normalizers()
        print("loading done")
    self.training_data, self.testing_data = {}, {}
    for env in self.envs.envs:
        self.training_data[env.name] = []
    for env in self.testing_envs.envs:
        self.testing_data[env.name] = []
    try:
        os.mkdir("Generated_data")
    except FileExistsError:
        pass
def __init__(self, args, env, env_params, image=True):
    self.args = args
    self.env = env
    self.env_params = env_params
    self.image = image
    # create the network
    if self.image:
        self.actor_network = actor_image(env_params, env_params['obs'])
        self.critic_network = critic_image(env_params, env_params['obs'] + env_params['action'])
    else:
        self.actor_network = actor(env_params, env_params['obs'])
        self.critic_network = critic(env_params, env_params['obs'] + env_params['action'])
    # load the model if load_dir is not empty
    if self.args.load_dir != '':
        actor_load_path = self.args.load_dir + '/actor.pt'
        model = torch.load(actor_load_path)
        self.actor_network.load_state_dict(model)
        critic_load_path = self.args.load_dir + '/critic.pt'
        model = torch.load(critic_load_path)
        self.critic_network.load_state_dict(model)
    # sync the networks across the cpus
    # sync_networks(self.actor_network)
    # sync_networks(self.critic_network)
    # build up the target network
    # if self.image:
    #     self.actor_target_network = actor_image(env_params, env_params['obs'])
    # else:
    #     self.actor_target_network = actor(env_params, env_params['obs'])
    # # load the weights into the target networks
    # self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        # self.actor_target_network.cuda()
        self.critic_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env().compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions, image=self.image)
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
def __init__(self, alpha=0.0003, beta=0.0003, inp_dims=[8], env=None, gamma=0.99, n_actions=2,
             max_size=1000000, tau=0.005, l1_size=256, l2_size=256, batch_size=256, reward_scale=2):
    self.tau = tau
    self.gamma = gamma
    self.mem = replay_buffer(max_size, inp_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = actor(alpha, inp_dims, n_actions=n_actions, name="Actor", max_act=env.action_space.high)
    self.critic1 = critic(beta, inp_dims, n_actions=n_actions, name="Critic1")
    self.critic2 = critic(beta, inp_dims, n_actions=n_actions, name="Critic2")
    self.value = value(beta, inp_dims, name="Value")
    self.target_val = value(beta, inp_dims, name="Target_value")
    self.scale = reward_scale
    self.update_net_params(tau=1)
def __init__(self, n_actions, path=None):
    self.dqn = DQN(n_actions, N_ATOMS)
    self.dqn.to(device)
    # loading the network
    if path is not None:
        self.dqn.load_state_dict(torch.load(path))
    self.dqn_target = DQN(n_actions, N_ATOMS)
    self.dqn_target.to(device)
    self.dqn_target.load_state_dict(self.dqn.state_dict())
    self.optimizer = optim.Adam(self.dqn.parameters(), lr=LEARNING_RATE, eps=ADAM_EPS)
    self.buf = replay_buffer(REPLAY_BUFFER_SIZE, N_STEPS)
    # the distribution for terminal states
    self.dist_zero = torch.zeros(N_ATOMS, device=device)
    self.dist_zero[N_ATOMS // 2] = 1.0
    self.atoms = torch.linspace(V_MIN, V_MAX, steps=N_ATOMS, device=device)
    self.delta_z = (V_MAX - V_MIN) / (N_ATOMS - 1)
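# --- Hedged sketch (not in the original): recovering expected Q-values from a categorical
# --- (C51-style) head like the one above. It assumes the network outputs per-action
# --- probabilities of shape [batch, n_actions, N_ATOMS]; that shape is an assumption.
def _expected_q(probabilities, atoms):
    # E[Z(s, a)] = sum_i p_i(s, a) * z_i
    return (probabilities * atoms).sum(dim=-1)  # -> [batch, n_actions]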
def __init__(self, env, episode, exploration, update_freq, freeze_interval, batch_size,
             capacity, learning_rate, option_num, gamma, termination_reg, epsilon_init,
             decay, epsilon_min, entropy_weight, conv, cuda, render, save_path=None):
    self.env = env
    self.episode = episode
    self.exploration = exploration
    self.update_freq = update_freq
    self.freeze_interval = freeze_interval
    self.batch_size = batch_size
    self.capacity = capacity
    self.learning_rate = learning_rate
    self.option_num = option_num
    self.gamma = gamma
    self.termination_reg = termination_reg
    self.epsilon_init = epsilon_init
    self.decay = decay
    self.epsilon_min = epsilon_min
    self.entropy_weight = entropy_weight
    self.conv = conv
    self.cuda = cuda
    self.render = render
    self.save_path = save_path
    if not self.conv:
        self.observation_dim = self.env.observation_space.shape[0]
    else:
        self.observation_dim = self.env.observation_space.shape
    self.action_dim = self.env.action_space.n
    self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(-x / self.decay)
    self.net = opt_cri_arch(self.observation_dim, self.action_dim, self.option_num, self.conv)
    self.prime_net = opt_cri_arch(self.observation_dim, self.action_dim, self.option_num, self.conv)
    if self.cuda:
        self.net = self.net.cuda()
        self.prime_net = self.prime_net.cuda()
    self.prime_net.load_state_dict(self.net.state_dict())
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
    self.buffer = replay_buffer(self.capacity)
    self.count = 0
    self.weight_reward = None
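# --- Illustrative note (not in the original): with the exponential schedule above and,
# --- for example, epsilon_init=1.0, epsilon_min=0.05, decay=1000 (assumed values),
# --- the exploration rate falls roughly as:
# ---   epsilon(0)    = 0.05 + 0.95 * exp(0)   = 1.00
# ---   epsilon(1000) = 0.05 + 0.95 * exp(-1) ~= 0.40
# ---   epsilon(5000) = 0.05 + 0.95 * exp(-5) ~= 0.056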
def __init__(self, env, seed):
    self.seed = seed
    self.successes = []
    self.epochs = []
    self.env = env
    self.device = torch.device(p.device)
    # create the networks
    self.actor = Actor(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)
    self.critic = Critic(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)
    # build up the target networks
    self.actor_target = Actor(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)
    self.critic_target = Critic(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)
    # load the weights into the target networks
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
    # create the replay buffer
    self.buffer = replay_buffer(self.env.ob_shape, self.env.action_shape)
def __init__(self, args, env, env_params):
    self.savetime = 0
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # whether to add demonstration data
    if self.args.add_demo:
        self._init_demo_buffer()  # initialize the replay buffer with demonstrations
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # load the data to continue the training
    # model_path = "saved_models/bmirobot-v3/125_True12_model.pt"
    # # model_path = args.save_dir + args.env_name + '/' + str(args.seed) + '_' + str(args.add_demo) + '_model.pt'
    # o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
    # self.actor_network.load_state_dict(model)
    # self.o_norm.mean = o_mean
    # self.o_norm.std = o_std
    # self.g_norm.mean = g_mean
    # self.g_norm.std = g_std
    self.success_rates = []  # record the success rate of each epoch
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
def __init__(self, args, envs_lst, env_params, expert_lst_dir, recurrent=True, ee_reward=True, image=True):
    self.args = args
    self.envs_lst = envs_lst
    self.env_params = env_params
    self.recurrent = recurrent
    self.ee_reward = ee_reward
    self.image = image
    # initialize the experts
    self.expert_lst = []
    for dir in expert_lst_dir:
        expert_load_path = dir + '/model.pt'
        o_mean, o_std, g_mean, g_std, model = torch.load(expert_load_path)
        expert_model = actor(env_params, env_params['obs'] + env_params['goal'])
        expert_model.load_state_dict(model)
        self.expert_lst.append({
            "model": expert_model,
            "o_mean": o_mean,
            "o_std": o_std,
            "g_mean": g_mean,
            "g_std": g_std
        })
    # create the network
    if self.recurrent:
        self.actor_network = actor_recurrent(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
        # self.critic_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
    else:
        self.actor_network = actor(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
    self.critic_network = critic(
        env_params,
        env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.sg_norm = normalizer(size=env_params['action'], default_clip_range=self.args.clip_range)
    # load the model if load_dir is not empty
    if self.args.load_dir != '':
        load_path = self.args.load_dir + '/model.pt'
        # o_mean, o_std, g_mean, g_std, sg_mean, sg_std, model = torch.load(load_path)
        o_mean, o_std, g_mean, g_std, model = torch.load(load_path)
        self.o_norm.mean = o_mean
        self.o_norm.std = o_std
        self.g_norm.mean = g_mean
        self.g_norm.std = g_std
        # self.sg_norm.mean = sg_mean
        # self.sg_norm.std = sg_std
        self.actor_network.load_state_dict(model)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    if self.recurrent:
        self.actor_target_network = actor_recurrent(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
        # self.critic_target_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
    else:
        self.actor_target_network = actor(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
    self.critic_target_network = critic(
        env_params,
        env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her samplers
    self.her_module_lst = [
        her_sampler(self.args.replay_strategy, self.args.replay_k, env.compute_reward)
        for env in self.envs_lst
    ]
    # create the replay buffers
    self.buffer_lst = [
        replay_buffer(self.env_params, self.args.buffer_size,
                      her_module.sample_her_transitions, ee_reward=True)
        for her_module in self.her_module_lst
    ]
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
EVAL_EPISODE = 10        # perform evaluation every EVAL_EPISODE episodes
NUM_STEPS = 200
NUM_EVALUATIONS = 10     # number of times we run evaluation
INITIAL_EPSILON = 0.9
MIN_EPSILON = 0.1
EPSILON_DECAY = 0.999
BUFFER_SIZE = 100000

env = gym.make('CartPole-v0')
ndim_action = env.action_space.n
ndim_obs = env.observation_space.shape[0]
epsilon = INITIAL_EPSILON
agent = dqn(ndim_obs, ndim_action)
replay = replay_buffer(BUFFER_SIZE)

for i in range(NUM_EPISODES):
    # Training
    obs = env.reset()
    for j in range(NUM_STEPS):
        # Epsilon-greedy exploration
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            action = agent.act(np.expand_dims(obs, 0))[0]
        # Perform an action, observe next state and reward
        newobs, reward, done, info = env.step(action)
        # Insert it to replay buffer
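        # --- Hedged continuation sketch (the original snippet stops at the comment above).
        # --- The buffer method name `add` and this exact bookkeeping are assumptions,
        # --- not taken from the original code.
        replay.add((obs, action, reward, newobs, done))
        obs = newobs
        if done:
            break
    # decay epsilon once per episode (assumed schedule using the constants defined above)
    epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)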
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    # sync_networks(self.actor_network)
    # sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the model if required
    if args.load_path is not None:
        o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        self.actor_network.load_state_dict(load_actor_model)
        self.critic_network.load_state_dict(load_critic_model)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # make up a suffix for the model path to indicate which method is used for training
    # self.folder_suffix = '_' + self.args.replay_strategy + '_' + self.args.env_params.reward_type
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.model_path = os.path.join(self.model_path, 'seed_' + str(self.args.seed))
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
def __init__(self, alpha, beta, inp_dims, tau, env, gamma=0.99, update_actor_int=2,
             warmup=1000, n_actions=2, max_size=1000000, l1_size=400, l2_size=300,
             batch_size=100, noise=0.1):
    self.tau = tau
    self.gamma = gamma
    self.min_act = env.action_space.low
    self.max_act = env.action_space.high
    self.mem = replay_buffer(max_size, inp_dims, n_actions)
    self.batch_size = batch_size
    self.lrn_step_count = 0
    self.warmup = warmup
    self.n_actions = n_actions
    self.time_step = 0
    self.update_act_iter = update_actor_int
    self.actor = actor(alpha, inp_dims, l1_size, l2_size, n_actions=n_actions, name="Actor")
    self.critic1 = critic(beta, inp_dims, l1_size, l2_size, n_actions=n_actions, name="Critic1")
    self.critic2 = critic(beta, inp_dims, l1_size, l2_size, n_actions=n_actions, name="Critic2")
    self.actor_target = actor(alpha, inp_dims, l1_size, l2_size, n_actions=n_actions, name="Actor_target")
    self.critic_target1 = critic(beta, inp_dims, l1_size, l2_size, n_actions=n_actions, name="Critic_target1")
    self.critic_target2 = critic(beta, inp_dims, l1_size, l2_size, n_actions=n_actions, name="Critic_target2")
    self.noise = noise
    self.update_net_params(tau=1)
BATCH_SIZE = 32
INITIAL_SIZE = 50000
CAPACITY_SIZE = 1000000
GAMMA = 0.99
ALPHA = 0.0001
C = 10000
NUM_SKIP = 4
N_ACTIONS = env.action_space.n
STATE_DIM = env.observation_space.shape[0]
HEIGHT = 28
WIDTH = 28
USE_GPU = True

# Initialize experience replay buffer
buffer = replay_buffer(CAPACITY_SIZE)

if USE_GPU:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)

Q = DQN(state_dim=STATE_DIM,
        num_action=N_ACTIONS,
        alpha=ALPHA,
        C=C,
        learning_start=INITIAL_SIZE,
        learningfreq=NUM_SKIP,
        height=HEIGHT,
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the model if required
    if args.load_path is not None:
        o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        self.actor_network.load_state_dict(load_actor_model)
        self.critic_network.load_state_dict(load_critic_model)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    if self.args.replay_strategy == 'future':
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
    else:
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_normal_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # make up a suffix for the model path to indicate which method is used for training
    buffer_len_epochs = int(self.args.buffer_size /
                            (env_params['max_timesteps'] * self.args.num_rollouts_per_cycle * self.args.n_cycles))
    name_add_on = ''
    if self.args.exploration_strategy == 'pgg':
        if self.args.pgg_strategy == 'final':
            if self.args.replay_strategy == 'future':
                name_add_on = '_final_distance_based_goal_generation_buffer' + str(buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_final_distance_based_goal_generation_withoutHER_buffer' + str(buffer_len_epochs) + 'epochs'
        else:
            if self.args.replay_strategy == 'future':
                name_add_on = '_distance_based_goal_generation_buffer' + str(buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_distance_based_goal_generation_withoutHER_buffer' + str(buffer_len_epochs) + 'epochs'
    else:
        if self.args.replay_strategy == 'future':
            name_add_on = '_originalHER_buffer' + str(buffer_len_epochs) + 'epochs'
        else:
            name_add_on = '_originalDDPG_buffer' + str(buffer_len_epochs) + 'epochs'
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name + name_add_on)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.model_path = os.path.join(self.model_path, 'seed_' + str(self.args.seed))
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
capacity = 10000
exploration = 200
epsilon_init = 0.99
epsilon_min = 0.2
decay = 0.998
episode = 1000000
render = False
threshold_reward = 80

env_list = [reverse_observation_wrapper(gym.make('CartPole-v0')),
            gym.make('CartPole-v0'),
            reverse_action_wrapper(reverse_observation_wrapper(gym.make('CartPole-v0'))),
            reverse_action_wrapper(gym.make('CartPole-v0'))]
observation_dim = env_list[0].observation_space.shape[0]
action_dim = env_list[0].action_space.n

target_net = Progressive_neural_net(3)
eval_net = Progressive_neural_net(3)
eval_net.load_state_dict(target_net.state_dict())
buffer = replay_buffer(capacity)
loss_fn = nn.MSELoss()
sizes = [observation_dim, 64, 32, action_dim]

for env_idx, env in enumerate(env_list):
    count = 0
    epsilon = epsilon_init
    eval_net.add_new_column(sizes)
    target_net.add_new_column(sizes)
    target_net.load_state_dict(eval_net.state_dict())
    optimizer = torch.optim.Adam(eval_net.parameters(env_idx), lr=learning_rate)
    weight_reward = None
    for i in range(episode):
        obs = env.reset()
        if epsilon > epsilon_min:
            epsilon = epsilon * decay