Example #1
 def __init__(self, args, env, env_params):
     self.args = args
     self.env = env
     self.env_params = env_params
     # create the network
     self.actor_network = actor(env_params)
     self.critic_network = critic(env_params)
     # sync the networks across the cpus
     sync_networks(self.actor_network)
     sync_networks(self.critic_network)
     # build up the target network
     self.actor_target_network = actor(env_params)
     self.critic_target_network = critic(env_params)
     # load the weights into the target networks
     self.actor_target_network.load_state_dict(
         self.actor_network.state_dict())
     self.critic_target_network.load_state_dict(
         self.critic_network.state_dict())
     # if use gpu
     if self.args.cuda:
         self.actor_network.cuda()
         self.critic_network.cuda()
         self.actor_target_network.cuda()
         self.critic_target_network.cuda()
     # create the optimizer
     self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                         lr=self.args.lr_actor)
     self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                          lr=self.args.lr_critic)
     # her sampler
     self.her_module = her_sampler(self.args.replay_strategy,
                                   self.args.replay_k,
                                   self.env.compute_reward)
     # create the replay buffer
     self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                 self.her_module.sample_her_transitions)
     self.buffer2 = replay_buffer(self.env_params, self.args.buffer_size,
                                  self.her_module.sample_transitions)
     # create the normalizer
     self.o_norm = normalizer(size=env_params['obs'],
                              default_clip_range=self.args.clip_range)
     self.g_norm = normalizer(size=env_params['goal'],
                              default_clip_range=self.args.clip_range)
     # create the directory to store the model
     if MPI.COMM_WORLD.Get_rank() == 0:
         if not os.path.exists(self.args.save_dir):
             os.mkdir(self.args.save_dir)
         # path to save the model
         self.model_path = os.path.join(self.args.save_dir,
                                        self.args.env_name)
         if not os.path.exists(self.model_path):
             os.mkdir(self.model_path)
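The constructor above only copies the online weights into the target networks once; during training, DDPG-style agents usually refresh the targets with a Polyak (soft) update. A minimal sketch of such an update, not taken from this example (the `polyak` coefficient name is an assumption):

import torch

def soft_update(target_net: torch.nn.Module, source_net: torch.nn.Module, polyak: float = 0.95):
    # target <- polyak * target + (1 - polyak) * source, applied parameter by parameter
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.mul_(polyak).add_((1.0 - polyak) * source_param.data)

# e.g. soft_update(agent.actor_target_network, agent.actor_network, polyak=0.95)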
Example #2
    def __init__(self, env, capacity, update_freq, episode, feature_dim, k_dim, dilation, horizon_c, learning_rate, alpha, gamma, entropy_weight, render):
        # * feature_dim >> k_dim
        # * dilation == horizon_c
        # * capacity <= update_freq
        self.env = env
        self.capacity = capacity
        self.update_freq = update_freq
        self.episode = episode
        self.feature_dim = feature_dim
        self.k_dim = k_dim
        self.dilation = dilation
        self.horizon_c = horizon_c
        self.learning_rate = learning_rate
        self.alpha = alpha
        self.gamma = gamma
        self.entropy_weight = entropy_weight
        self.render = render

        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        self.net = feudal_networks(self.observation_dim, self.feature_dim, self.k_dim, self.action_dim, self.dilation, self.horizon_c)
        self.buffer = replay_buffer(self.capacity)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
        self.h_m = torch.zeros([1, self.feature_dim])
        self.c_m = torch.zeros([1, self.feature_dim])
        self.h_w = torch.zeros([1, self.action_dim * self.k_dim])
        self.c_w = torch.zeros([1, self.action_dim * self.k_dim])
        self.count = 0
        self.weight_reward = None
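In the FeUdal Networks setup this constructor prepares for, the worker's intrinsic reward is usually the average cosine similarity between recent state transitions and the manager's past goals. A rough sketch of that quantity under the standard formulation (the `states`/`goals` tensors and horizon `c` are illustrative assumptions, not variables from this snippet):

import torch
import torch.nn.functional as F

def intrinsic_reward(states, goals, t, c):
    """FeUdal-style intrinsic reward at time t:
    mean over i = 1..c of cos(states[t] - states[t - i], goals[t - i]).
    Assumes t >= c and that states/goals are [T, feature_dim] tensors."""
    reward = states.new_zeros(())
    for i in range(1, c + 1):
        delta = states[t] - states[t - i]
        reward = reward + F.cosine_similarity(delta, goals[t - i], dim=-1)
    return reward / c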
Example #3
    def __init__(self, global_net, optimizer, global_episode_counter,
                 global_reward, res_queue, name, max_episode, gamma, env_id,
                 capacity, train_freq, n_step, stack_num, pc_weight, rp_weight,
                 vr_weight, batch_size, observation_dim, entropy_weight):
        super(worker, self).__init__()
        self.name = 'w' + name
        self.global_episode_counter = global_episode_counter
        self.global_reward = global_reward
        self.res_queue = res_queue
        self.global_net = global_net
        self.optimizer = optimizer
        self.max_episode = max_episode
        self.gamma = gamma
        self.env_id = env_id
        self.env = gym.make(env_id)
        self.env = self.env.unwrapped
        self.action_dim = self.env.action_space.n
        self.observation_dim = observation_dim
        self.entropy_weight = entropy_weight
        self.local_net = unreal(self.observation_dim, self.action_dim,
                                self.gamma, self.entropy_weight)
        self.capacity = capacity
        self.train_freq = train_freq
        self.n_step = n_step
        self.stack_num = stack_num
        self.pc_weight = pc_weight
        self.rp_weight = rp_weight
        self.vr_weight = vr_weight
        self.batch_size = batch_size

        self.buffer = replay_buffer(self.capacity, self.train_freq,
                                    self.n_step, self.stack_num, self.gamma)
        self.last_action = torch.zeros(1, self.action_dim)
        self.last_reward = torch.zeros(1, 1)
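Workers like this one typically compute a loss on `self.local_net`, push the resulting gradients into `self.global_net`, and then pull the updated global weights back. A minimal sketch of that A3C-style sync step, assuming `loss` has already been computed from the local network (not code from this example):

def push_and_pull(optimizer, local_net, global_net, loss):
    # gradients are computed on the worker's local copy
    optimizer.zero_grad()
    loss.backward()
    # hand the local gradients to the shared global parameters and step the shared optimizer
    for local_param, global_param in zip(local_net.parameters(), global_net.parameters()):
        global_param.grad = local_param.grad
    optimizer.step()
    # refresh the local copy with the updated global weights
    local_net.load_state_dict(global_net.state_dict())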
Example #4
    def __init__(self,
                 alpha=0.001,
                 beta=0.002,
                 inp_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 fcl1=512,
                 fcl2=512,
                 batch_size=64):

        self.gamma = gamma
        self.tau = tau
        self.mem = replay_buffer(max_size, inp_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]
        self.actor = actor(n_actions=n_actions, name="Actor")
        self.critic = critic(n_actions=n_actions, name="Critic")

        self.actor_target = actor(n_actions=n_actions, name="Target_actor")
        self.critic_target = critic(n_actions=n_actions, name="Target_critic")

        # the learning rate is given only as a formality so that compile() succeeds
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.actor_target.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_target.compile(optimizer=Adam(learning_rate=beta))

        self.noise = ou_action_noise(mu=np.zeros(n_actions))

        self.update_net_params(tau=1)
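`ou_action_noise` is not shown above; a common Ornstein-Uhlenbeck noise process used for DDPG exploration looks roughly like the sketch below (the class name and parameter defaults are assumptions):

import numpy as np

class OUActionNoise:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta, self.dt, self.x0 = mu, sigma, theta, dt, x0
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # start from x0 if given, otherwise from zeros with the same shape as mu
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

At action-selection time the sampled noise is simply added to the deterministic actor output before clipping to the action bounds.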
Example #5
def main():
    initial_exploration = 2000  ## start training only after 2000 transitions have accumulated in memory
    print_interval = 100  ## how many episodes between log printouts

    maximum_steps = 300  ## CartPole is effectively an infinite task, so each episode is cut off after a fixed number of steps
    episode = 3000  # total number of episodes

    action_space = 2
    state_space = 4
    env = gym.make('CartPole-v1')

    multi_step_size = 10  # 1 = 1-step TD (i.e. TD(0)), 2 = 2-step TD, 3 = 3-step TD, ...
    DDQN = Double_DQN.double_dqn(state_space, action_space, multi_step_size)

    buffer_size = 100000  # replay buffer_size
    batch_size = 32  # batch size
    replay_buffer = buf.replay_buffer(buffer_size, multi_step_size, batch_size)
    step = 0  ## counter for the total number of steps taken
    score = 0
    show_score = []

    for epi in range(episode):
        obs = env.reset()  ## reset the environment
        for i in range(maximum_steps):

            action = DDQN.action_policy(torch.Tensor(obs))
            next_obs, reward, done, _ = env.step(
                action)  ## _ holds the info dict, which is not needed for training

            mask = 0 if done else 1  ## mask = 0 once the episode is done, else 1; a simple alive/terminal flag

            replay_buffer.store((obs, action, reward, next_obs,
                                 mask))  # store the transition in the replay buffer

            obs = next_obs  ## the next state now becomes the current state
            score += reward  ## accumulate the reward
            step += 1

            if step > initial_exploration:  ## start training once the initial exploration phase is over
                random_mini_batch, random_mini_batch_next, index, buffer = replay_buffer.make_batch(
                )  # random batch sampling.
                DDQN.train(random_mini_batch, random_mini_batch_next, index,
                           buffer)

            if done:  ## the episode is over, so break out of the loop and reset the environment
                break

        if epi % print_interval == 0 and epi != 0:
            show_score.append(score / print_interval)  ## store the averaged reward score
            print('episode: ', epi, ' step: ', step, ' epsilon: ',
                  DDQN.print_eps(), ' score: ',
                  score / print_interval)  # print the log
            score = 0
            with open('10step_ddqn_dueling.p', 'wb') as file:
                pickle.dump(show_score, file)

    env.close()
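With `multi_step_size = 10`, the Double-DQN target is built from a 10-step return rather than a single reward. A small illustrative helper for such a target (the names and the bootstrap convention are assumptions, not this repository's code):

def n_step_return(rewards, gamma, bootstrap_value, mask):
    """Discounted sum of an n-step reward window, bootstrapped with the value of the
    state reached after the window; mask = 0 if the episode terminated at the end of
    the window. Assumes the window does not cross an episode boundary before its end."""
    g = 0.0
    for r in reversed(rewards):        # rewards[0] is the oldest reward in the window
        g = r + gamma * g
    return g + (gamma ** len(rewards)) * bootstrap_value * mask

For example, `n_step_return([1.0] * 10, 0.99, v_next, 1)` would give a 10-step CartPole target while the episode is still alive.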
Example #6
    def __init__(self, env, episode, capacity, gamma, lam, is_disc,
                 value_learning_rate, policy_learning_rate,
                 discriminator_learning_rate, batch_size, file, policy_iter,
                 disc_iter, value_iter, epsilon, entropy_weight, train_iter,
                 clip_grad, render):
        self.env = env
        self.episode = episode
        self.capacity = capacity
        self.gamma = gamma
        self.lam = lam
        self.is_disc = is_disc
        self.value_learning_rate = value_learning_rate
        self.policy_learning_rate = policy_learning_rate
        self.discriminator_learning_rate = discriminator_learning_rate
        self.batch_size = batch_size
        self.file = file
        self.policy_iter = policy_iter
        self.disc_iter = disc_iter
        self.value_iter = value_iter
        self.epsilon = epsilon
        self.entropy_weight = entropy_weight
        self.train_iter = train_iter
        self.clip_grad = clip_grad
        self.render = render

        self.observation_dim = self.env.observation_space.shape[0]
        if is_disc:
            self.action_dim = self.env.action_space.n
        else:
            self.action_dim = self.env.action_space.shape[0]
        if is_disc:
            self.policy_net = disc_policy_net(self.observation_dim,
                                              self.action_dim)
        else:
            self.policy_net = cont_policy_net(self.observation_dim,
                                              self.action_dim)
        self.value_net = value_net(self.observation_dim, 1)
        self.discriminator = discriminator(self.observation_dim +
                                           self.action_dim)
        self.buffer = replay_buffer(self.capacity, self.gamma, self.lam)
        self.pool = pickle.load(self.file)
        self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                                 lr=self.policy_learning_rate)
        self.value_optimizer = torch.optim.Adam(self.value_net.parameters(),
                                                lr=self.value_learning_rate)
        self.discriminator_optimizer = torch.optim.Adam(
            self.discriminator.parameters(),
            lr=self.discriminator_learning_rate)
        self.disc_loss_func = nn.BCELoss()
        self.weight_reward = None
        self.weight_custom_reward = None
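The discriminator above is trained with `nn.BCELoss` to tell expert state-action pairs apart from the policy's own rollouts. A rough sketch of one such update under a common labeling convention (expert = 1, policy = 0; the variable names are assumptions):

import torch
import torch.nn as nn

def discriminator_step(discriminator, optimizer, expert_sa, policy_sa):
    # the discriminator outputs a probability in (0, 1), e.g. via a final sigmoid
    bce = nn.BCELoss()
    expert_prob = discriminator(expert_sa)
    policy_prob = discriminator(policy_sa)
    loss = bce(expert_prob, torch.ones_like(expert_prob)) + \
           bce(policy_prob, torch.zeros_like(policy_prob))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

The imitation reward handed to the policy is then derived from the discriminator output, e.g. `-log(1 - D(s, a))`.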
Example #7
    def __init__(self, envs, testing_envs, seed, variance_limit = 0.25):
        self.seed = seed
        self.successes = []
        self.testing_envs = testing_envs
        self.envs = envs
        self.variance_limit = variance_limit
        
        training_envs_per_dof = int(len(self.envs.envs)/3)
        
        self.training_env_seq = [4]*training_envs_per_dof + [5]*training_envs_per_dof + [6]*training_envs_per_dof
        self.testing_env_seq = [4]*10 + [5]*10 + [6]*10

        if p.mode == "retrain":
            self.training_env_seq = self.testing_env_seq

        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)

        if p.mode == 'retrain':
            self.actor.load_state_dict(torch.load("actor_seed_{}".format(seed)))
            self.critic.load_state_dict(torch.load("critic_seed_{}".format(seed)))

        # build up the target network
        self.actor_target = Actor().to(self.device)
        self.critic_target = Critic().to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # create the replay buffer
        self.buffer = replay_buffer(seed)

        if p.mode == 'retrain':
            self.buffer.load_normalizers()
            print("loading done")

        self.training_data, self.testing_data = {}, {}
        for env in self.envs.envs:
            self.training_data[env.name] = []
        for env in self.testing_envs.envs:
            self.testing_data[env.name] = []

        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass
Example #8
    def __init__(self, args, env, env_params, image=True):
        self.args = args
        self.env = env
        self.env_params = env_params
        self.image = image

        # create the network
        if self.image:
            self.actor_network = actor_image(env_params, env_params['obs'])
            self.critic_network = critic_image(env_params, env_params['obs'] + env_params['action'])
        else:
            self.actor_network = actor(env_params, env_params['obs'])
            self.critic_network = critic(env_params, env_params['obs'] + env_params['action'])

        # load model if load_path is not None
        if self.args.load_dir != '':
            actor_load_path = self.args.load_dir + '/actor.pt'
            model = torch.load(actor_load_path)
            self.actor_network.load_state_dict(model)
            critic_load_path = self.args.load_dir + '/critic.pt'
            model = torch.load(critic_load_path)
            self.critic_network.load_state_dict(model)

        # sync the networks across the cpus
        # sync_networks(self.actor_network)
        # sync_networks(self.critic_network)
        # build up the target network
        # if self.image:
        #     self.actor_target_network = actor_image(env_params, env_params['obs'])
        # else:
        #     self.actor_target_network = actor(env_params, env_params['obs'])
        # # load the weights into the target networks
        # self.actor_target_network.load_state_dict(self.actor_network.state_dict())

        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            # self.actor_target_network.cuda()
            self.critic_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env().compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions, image=self.image)

        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
Example #9
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 inp_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 l1_size=256,
                 l2_size=256,
                 batch_size=256,
                 reward_scale=2):

        self.tau = tau
        self.gamma = gamma

        self.mem = replay_buffer(max_size, inp_dims, n_actions)

        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = actor(alpha,
                           inp_dims,
                           n_actions=n_actions,
                           name="Actor",
                           max_act=env.action_space.high)

        self.critic1 = critic(beta,
                              inp_dims,
                              n_actions=n_actions,
                              name="Critic1")
        self.critic2 = critic(beta,
                              inp_dims,
                              n_actions=n_actions,
                              name="Critic2")

        self.value = value(beta, inp_dims, name="Value")
        self.target_val = value(beta, inp_dims, name="Target_value")

        self.scale = reward_scale

        self.update_net_params(tau=1)
Example #10
    def __init__(self, n_actions, path = None):
        self.dqn = DQN(n_actions, N_ATOMS)
        self.dqn.to(device)

        # loading the network
        if path is not None:
            self.dqn.load_state_dict(torch.load(path))

        self.dqn_target = DQN(n_actions, N_ATOMS)
        self.dqn_target.to(device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())

        self.optimizer = optim.Adam(self.dqn.parameters(), lr = LEARNING_RATE, eps = ADAM_EPS)
        self.buf = replay_buffer(REPLAY_BUFFER_SIZE, N_STEPS)

        # the distribution for terminal states
        self.dist_zero = torch.zeros(N_ATOMS, device = device)
        self.dist_zero[N_ATOMS // 2] = 1.0

        self.atoms = torch.linspace(V_MIN, V_MAX, steps = N_ATOMS, device = device)
        self.delta_z = (V_MAX - V_MIN)/(N_ATOMS - 1)
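Given the atom support built above, greedy action selection reduces the categorical distribution to scalar Q-values by taking an expectation over the atoms. A minimal sketch (the `[batch, n_actions, N_ATOMS]` probability shape is an assumption about the network output):

import torch

def q_values(probs, atoms):
    # probs: [batch, n_actions, n_atoms] softmax output; atoms: [n_atoms] support z_i
    return (probs * atoms).sum(dim=-1)   # expected return per action: [batch, n_actions]

# greedy_action = q_values(probs, atoms).argmax(dim=-1)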
Example #11
    def __init__(self, env, episode, exploration, update_freq, freeze_interval, batch_size, capacity, learning_rate, option_num, gamma, termination_reg, epsilon_init, decay, epsilon_min, entropy_weight, conv, cuda, render, save_path=None):
        self.env = env
        self.episode = episode
        self.exploration = exploration
        self.update_freq = update_freq
        self.freeze_interval = freeze_interval
        self.batch_size = batch_size
        self.capacity = capacity
        self.learning_rate = learning_rate
        self.option_num = option_num
        self.gamma = gamma
        self.termination_reg = termination_reg
        self.epsilon_init = epsilon_init
        self.decay = decay
        self.epsilon_min = epsilon_min
        self.entropy_weight = entropy_weight
        self.conv = conv
        self.cuda = cuda
        self.render = render
        self.save_path = save_path

        if not self.conv:
            self.observation_dim = self.env.observation_space.shape[0]
        else:
            self.observation_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.n
        self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(- x / self.decay)
        self.net = opt_cri_arch(self.observation_dim, self.action_dim, self.option_num, self.conv)
        self.prime_net = opt_cri_arch(self.observation_dim, self.action_dim, self.option_num, self.conv)
        if self.cuda:
            self.net = self.net.cuda()
            self.prime_net = self.prime_net.cuda()
        self.prime_net.load_state_dict(self.net.state_dict())
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
        self.buffer = replay_buffer(self.capacity)
        self.count = 0
        self.weight_reward = None
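The `self.epsilon` lambda anneals exploration exponentially from `epsilon_init` toward `epsilon_min` as the step counter grows. The same schedule as a standalone function, for illustration only (the default values are assumptions):

import math

def epsilon_schedule(step, eps_init=1.0, eps_min=0.05, decay=1000.0):
    # equals eps_init at step 0 and decays exponentially toward eps_min
    return eps_min + (eps_init - eps_min) * math.exp(-step / decay)

In the class above it would presumably be queried with the running counter, e.g. `self.epsilon(self.count)`, each time an option or primitive action is chosen.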
Example #12
 def __init__(self, env, seed):
     self.seed = seed
     self.successes = []
     self.epochs = []
     self.env = env
     self.device = torch.device(p.device)
     # create the network
     self.actor = Actor(self.env.ob_shape, self.env.goal_shape,
                        self.env.action_shape).to(self.device)
     self.critic = Critic(self.env.ob_shape, self.env.goal_shape,
                          self.env.action_shape).to(self.device)
     # build up the target network
     self.actor_target = Actor(self.env.ob_shape, self.env.goal_shape,
                               self.env.action_shape).to(self.device)
     self.critic_target = Critic(self.env.ob_shape, self.env.goal_shape,
                                 self.env.action_shape).to(self.device)
     # load the weights into the target networks
     self.actor_target.load_state_dict(self.actor.state_dict())
     self.critic_target.load_state_dict(self.critic.state_dict())
     # create the optimizers
     self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
     self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
     # create the replay buffer
     self.buffer = replay_buffer(self.env.ob_shape, self.env.action_shape)
Example #13
 def __init__(self, args, env, env_params):
     self.savetime = 0
     self.args = args
     self.env = env
     self.env_params = env_params
     # create the network
     self.actor_network = actor(env_params)
     self.critic_network = critic(env_params)
     # sync the networks across the cpus
     sync_networks(self.actor_network)
     sync_networks(self.critic_network)
     # build up the target network
     self.actor_target_network = actor(env_params)
     self.critic_target_network = critic(env_params)
     # load the weights into the target networks
     self.actor_target_network.load_state_dict(
         self.actor_network.state_dict())
     self.critic_target_network.load_state_dict(
         self.critic_network.state_dict())
     # if use gpu
     if self.args.cuda:
         self.actor_network.cuda()
         self.critic_network.cuda()
         self.actor_target_network.cuda()
         self.critic_target_network.cuda()
     # create the optimizer
     self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                         lr=self.args.lr_actor)
     self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                          lr=self.args.lr_critic)
     # her sampler
     self.her_module = her_sampler(self.args.replay_strategy,
                                   self.args.replay_k,
                                   self.env.compute_reward)
     # create the replay buffer
     self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                 self.her_module.sample_her_transitions)
     # whether to add demonstration data
     if self.args.add_demo:
         self._init_demo_buffer(
         )  # initialize replay buffer with demonstration
     # create the normalizer
     self.o_norm = normalizer(size=env_params['obs'],
                              default_clip_range=self.args.clip_range)
     self.g_norm = normalizer(size=env_params['goal'],
                              default_clip_range=self.args.clip_range)
     # load the data to continue the training
     # model_path = "saved_models/bmirobot-v3/125_True12_model.pt"
     # # # model_path = args.save_dir + args.env_name + '/' + str(args.seed) + '_' + str(args.add_demo) + '_model.pt'
     # # o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
     # self.actor_network.load_state_dict(model)
     # self.o_norm.mean=o_mean
     # self.o_norm.std=o_std
     # self.g_norm.mean=g_mean
     # self.g_norm.std=g_std
     self.success_rates = []  # record the success rate of each epoch
     # create the directory to store the model
     if MPI.COMM_WORLD.Get_rank() == 0:
         if not os.path.exists(self.args.save_dir):
             os.mkdir(self.args.save_dir)
         # path to save the model
         self.model_path = os.path.join(self.args.save_dir,
                                        self.args.env_name)
         if not os.path.exists(self.model_path):
             os.mkdir(self.model_path)
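The `normalizer` objects keep running mean/std statistics for observations and goals and clip the standardized values to `clip_range`. A rough sketch of the normalization they are expected to apply (an assumption about the interface, not this repository's implementation):

import numpy as np

def normalize(v, mean, std, clip_range):
    # standardize with running statistics and clip, guarding against a zero std
    return np.clip((v - mean) / np.maximum(std, 1e-8), -clip_range, clip_range)

# e.g. obs_norm = normalize(obs, o_norm.mean, o_norm.std, args.clip_range)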
Example #14
    def __init__(self,
                 args,
                 envs_lst,
                 env_params,
                 expert_lst_dir,
                 recurrent=True,
                 ee_reward=True,
                 image=True):
        self.args = args
        self.envs_lst = envs_lst
        self.env_params = env_params
        self.recurrent = recurrent
        self.ee_reward = ee_reward
        self.image = image

        # initialize expert
        self.expert_lst = []
        for dir in expert_lst_dir:
            expert_load_path = dir + '/model.pt'
            o_mean, o_std, g_mean, g_std, model = torch.load(expert_load_path)
            expert_model = actor(env_params,
                                 env_params['obs'] + env_params['goal'])
            expert_model.load_state_dict(model)
            self.expert_lst.append({
                "model": expert_model,
                "o_mean": o_mean,
                "o_std": o_std,
                "g_mean": g_mean,
                "g_std": g_std
            })

        # create the network
        if self.recurrent:
            self.actor_network = actor_recurrent(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
            # self.critic_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
        else:
            self.actor_network = actor(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
        self.critic_network = critic(
            env_params,
            env_params['obs'] + 2 * env_params['goal'] + env_params['action'])

        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        self.sg_norm = normalizer(size=env_params['action'],
                                  default_clip_range=self.args.clip_range)

        # load model if load_path is not None
        if self.args.load_dir != '':
            load_path = self.args.load_dir + '/model.pt'
            # o_mean, o_std, g_mean, g_std, sg_mean, sg_std, model = torch.load(load_path)
            o_mean, o_std, g_mean, g_std, model = torch.load(load_path)
            self.o_norm.mean = o_mean
            self.o_norm.std = o_std
            self.g_norm.mean = g_mean
            self.g_norm.std = g_std
            # self.sg_norm.mean = sg_mean
            # self.sg_norm.std = sg_std
            self.actor_network.load_state_dict(model)

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        if self.recurrent:
            self.actor_target_network = actor_recurrent(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
            # self.critic_target_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
        else:
            self.actor_target_network = actor(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
        self.critic_target_network = critic(
            env_params,
            env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module_lst = [
            her_sampler(self.args.replay_strategy, self.args.replay_k,
                        env.compute_reward) for env in self.envs_lst
        ]
        # create the replay buffer
        self.buffer_lst = [
            replay_buffer(self.env_params,
                          self.args.buffer_size,
                          her_module.sample_her_transitions,
                          ee_reward=True) for her_module in self.her_module_lst
        ]

        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
Example #15
EVAL_EPISODE = 10  # Perform evaluation each EVAL_EPISODE number of episodes
NUM_STEPS = 200
NUM_EVALUATIONS = 10  # Number of times we run evaluation
INITIAL_EPSILON = 0.9
MIN_EPSILON = 0.1
EPSILON_DECAY = 0.999
BUFFER_SIZE = 100000

env = gym.make('CartPole-v0')

ndim_action = env.action_space.n
ndim_obs = env.observation_space.shape[0]
epsilon = INITIAL_EPSILON

agent = dqn(ndim_obs, ndim_action)
replay = replay_buffer(BUFFER_SIZE)

for i in range(NUM_EPISODES):
    # Training
    obs = env.reset()
    for j in range(NUM_STEPS):
        # Epsilon greedy exploration
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            action = agent.act(np.expand_dims(obs, 0))[0]

        # Perform an action, observe next state and reward
        newobs, reward, done, info = env.step(action)

        # Insert it into the replay buffer
Example #16
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params

        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        #sync_networks(self.actor_network)
        #sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)

        # Load the model if required
        if args.load_path is not None:
            o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
                args.load_path, map_location=lambda storage, loc: storage)
            self.actor_network.load_state_dict(load_actor_model)
            self.critic_network.load_state_dict(load_critic_model)

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        # create the directory to store the model
        #if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        # make up a suffix for the model path to indicate which method is used for training
        #self.folder_siffix = '_' + self.args.replay_strategy + '_' + self.args.env_params.reward_type
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.model_path = os.path.join(self.model_path,
                                       'seed_' + str(self.args.seed))
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
Example #17
    def __init__(self,
                 alpha,
                 beta,
                 inp_dims,
                 tau,
                 env,
                 gamma=0.99,
                 update_actor_int=2,
                 warmup=1000,
                 n_actions=2,
                 max_size=1000000,
                 l1_size=400,
                 l2_size=300,
                 batch_size=100,
                 noise=0.1):

        self.tau = tau
        self.gamma = gamma
        self.min_act = env.action_space.low
        self.max_act = env.action_space.high
        self.mem = replay_buffer(max_size, inp_dims, n_actions)

        self.batch_size = batch_size
        self.lrn_step_count = 0
        self.warmup = warmup
        self.n_actions = n_actions
        self.time_step = 0

        self.update_act_iter = update_actor_int

        self.actor = actor(alpha,
                           inp_dims,
                           l1_size,
                           l2_size,
                           n_actions=n_actions,
                           name="Actor")

        self.critic1 = critic(beta,
                              inp_dims,
                              l1_size,
                              l2_size,
                              n_actions=n_actions,
                              name="Critic1")
        self.critic2 = critic(beta,
                              inp_dims,
                              l1_size,
                              l2_size,
                              n_actions=n_actions,
                              name="Critic2")

        self.actor_target = actor(alpha,
                                  inp_dims,
                                  l1_size,
                                  l2_size,
                                  n_actions=n_actions,
                                  name="Actor_target")
        self.critic_target1 = critic(beta,
                                     inp_dims,
                                     l1_size,
                                     l2_size,
                                     n_actions=n_actions,
                                     name="Critic_target1")
        self.critic_target2 = critic(beta,
                                     inp_dims,
                                     l1_size,
                                     l2_size,
                                     n_actions=n_actions,
                                     name="Critic_target2")

        self.noise = noise
        self.update_net_params(tau=1)
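TD3 uses the stored `noise` scale and action bounds when forming critic targets: clipped Gaussian noise is added to the target actor's action (target policy smoothing). A framework-neutral sketch of that step (the `noise_clip` bound and defaults are assumptions):

import numpy as np

def smoothed_target_action(target_actor, next_state, min_act, max_act,
                           noise_scale=0.2, noise_clip=0.5):
    action = target_actor(next_state)                     # deterministic target policy output
    noise = np.clip(np.random.normal(scale=noise_scale, size=np.shape(action)),
                    -noise_clip, noise_clip)
    return np.clip(action + noise, min_act, max_act)      # keep the smoothed action in bounds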
Example #18
BATCH_SIZE = 32
INITIAL_SIZE = 50000
CAPACITY_SIZE = 1000000
GAMMA = 0.99
ALPHA = 0.0001
C = 10000
NUM_SKIP = 4
N_ACTIONS = env.action_space.n
STATE_DIM = env.observation_space.shape[0]
HEIGHT = 28
WIDTH = 28

USE_GPU = True

# Initialize experience replay buffer
buffer = replay_buffer(CAPACITY_SIZE)

if USE_GPU:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")

print(device)

Q = DQN(state_dim=STATE_DIM,
        num_action=N_ACTIONS,
        alpha=ALPHA,
        C=C,
        learning_start=INITIAL_SIZE,
        learningfreq=NUM_SKIP,
        height=HEIGHT,
Example #19
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params

        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)

        # Load the model if required
        if args.load_path is not None:
            o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
                args.load_path, map_location=lambda storage, loc: storage)
            self.actor_network.load_state_dict(load_actor_model)
            self.critic_network.load_state_dict(load_critic_model)

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        if self.args.replay_strategy == 'future':
            self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                        self.her_module.sample_her_transitions)
        else:
            self.buffer = replay_buffer(
                self.env_params, self.args.buffer_size,
                self.her_module.sample_normal_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        # create the directory to store the model
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        # make up a suffix for the model path to indicate which method is used for training
        buffer_len_epochs = int(
            self.args.buffer_size /
            (env_params['max_timesteps'] * self.args.num_rollouts_per_cycle *
             self.args.n_cycles))
        name_add_on = ''
        if self.args.exploration_strategy == 'pgg':
            if self.args.pgg_strategy == 'final':
                if self.args.replay_strategy == 'future':
                    name_add_on = '_final_distance_based_goal_generation_buffer' + str(
                        buffer_len_epochs) + 'epochs'
                else:
                    name_add_on = '_final_distance_based_goal_generation_withoutHER_buffer' + str(
                        buffer_len_epochs) + 'epochs'
            else:
                if self.args.replay_strategy == 'future':
                    name_add_on = '_distance_based_goal_generation_buffer' + str(
                        buffer_len_epochs) + 'epochs'
                else:
                    name_add_on = '_distance_based_goal_generation_withoutHER_buffer' + str(
                        buffer_len_epochs) + 'epochs'
        else:
            if self.args.replay_strategy == 'future':
                name_add_on = '_originalHER_buffer' + str(
                    buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_originalDDPG_buffer' + str(
                    buffer_len_epochs) + 'epochs'

        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name + name_add_on)

        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.model_path = os.path.join(self.model_path,
                                       'seed_' + str(self.args.seed))
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
Example #20
    capacity = 10000
    exploration = 200
    epsilon_init = 0.99
    epsilon_min = 0.2
    decay = 0.998
    episode = 1000000
    render = False
    threshold_reward = 80

    env_list = [reverse_observation_wrapper(gym.make('CartPole-v0')), gym.make('CartPole-v0'), reverse_action_wrapper(reverse_observation_wrapper(gym.make('CartPole-v0'))), reverse_action_wrapper(gym.make('CartPole-v0'))]
    observation_dim = env_list[0].observation_space.shape[0]
    action_dim = env_list[0].action_space.n
    target_net = Progressive_neural_net(3)
    eval_net = Progressive_neural_net(3)
    eval_net.load_state_dict(target_net.state_dict())
    buffer = replay_buffer(capacity)
    loss_fn = nn.MSELoss()
    sizes = [observation_dim, 64, 32, action_dim]

    for env_idx, env in enumerate(env_list):
        count = 0
        epsilon = epsilon_init
        eval_net.add_new_column(sizes)
        target_net.add_new_column(sizes)
        target_net.load_state_dict(eval_net.state_dict())
        optimizer = torch.optim.Adam(eval_net.parameters(env_idx), lr=learning_rate)
        weight_reward = None
        for i in range(episode):
            obs = env.reset()
            if epsilon > epsilon_min:
                epsilon = epsilon * decay