Example no. 1
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, chkpt_dir='tmp/dueling_ddqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = Network(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name='lunar_lander_dueling_ddqn_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = Network(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name='lunar_lander_dueling_ddqn_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def train_convolutional_part(self, env, n_frames, print_state_every=100):
        self.current_model.mode_enc_dec = True
        # Take a random action
        action = self.current_model.act(state=None, epsilon=1.)
        state = env.reset()
        states_buffer = ReplayBuffer(capacity=1000)
        losses = []
        for i in range(n_frames):
            next_state, reward, done, _ = env.step(action)
            states_buffer.push(state, action, reward, next_state, done)
            state = next_state

            # Re-sample a random action every 4 frames
            if i % 4 == 0:
                action = self.current_model.act(state=None, epsilon=1.)

            if done:
                print("Episode done during Encoder Decoder Training")
                state = env.reset()
            if len(states_buffer) > self.batch_size:
                # Train
                loss = self.compute_conv_loss(
                    states_buffer.state_sample(batch_size=self.batch_size))
                # Save the loss
                losses.append(loss.item())
            if i % print_state_every == 0 and len(losses) > 1:
                print(f"Training Encoder Decoder. Step: {i}/{n_frames}. "
                      f"Mean Loss: {np.round(np.mean(losses[-10:]), decimals=5)}")
        for param in self.current_model.encoder.parameters():
            param.requires_grad = False
        self.current_model.mode_enc_dec = False
        self.update_target()
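
The constructor in Example no. 1 stores epsilon, eps_min, eps_dec and action_space, but the action-selection and epsilon-decay methods fall outside this excerpt. Below is a minimal sketch of how such an epsilon-greedy pair is commonly written for a dueling agent; the method names, the assumption that numpy is imported as np and torch as T, and the assumption that q_eval.forward returns a (value, advantage) pair are illustrative and not taken from the snippet.

    def choose_action(self, observation):
        # Exploit with probability (1 - epsilon), explore otherwise
        if np.random.random() > self.epsilon:
            state = T.tensor(np.array([observation]), dtype=T.float32)
            _, advantage = self.q_eval.forward(state)  # assumed dueling (V, A) output
            action = T.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def decrement_epsilon(self):
        # Linearly decay epsilon down to eps_min
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)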
Example no. 3
def main():
    policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q network
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v
             for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q network
    target_net.load_state_dict(policy_net.state_dict())  # copy the Q network's parameters into the target network
    target_net.eval()
    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  # Adam optimizer; can be swapped for another
    buffer = ReplayBuffer(
        buffer_size
    )  # replay buffer: stores experience tuples from which random batches are later drawn to train the network (defined in Buffer.py)
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):

        state0 = [user_loc, user_dis, node_loc, use_buff]  # get an initial state
        error = 0.0
        all_reward = 0
        for t in count():
            # select an action
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            #print("action selected by e_greedy is {}".format(action))
            # use the transition function to obtain the next state reached by taking the current action, and whether that next state is terminal
            state1, done, flag = transition_function(state0, action)
            # use the reward function to obtain the current reward
            reward, cost_migration = reward_function(state0, action, state1,
                                                     flag)
            all_reward = all_reward + reward
            # store the experience tuple in the buffer
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            # update the network parameters only after enough episodes have passed, so the replay buffer holds more data and training is more stable
            if i_episode > 1:

                # sample a batch of training data from the buffer; the batch size is set by BATCH_SIZE
                batch = buffer.getBatch(BATCH_SIZE)

                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()
            # move on to the next state
            state0 = state1
        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
    torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')
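
main() relies on optimize_model to perform the Bellman update, but that function is defined elsewhere. The sketch below shows a standard single-step DQN update consistent with the call signature above; the discount-factor name, the tensor conversions, and the assumption that states are flat numeric vectors are illustrative only.

import numpy as np
import torch

GAMMA = 0.9  # assumed discount factor; the real value is configured elsewhere

def optimize_model(batch, policy_net, target_net, optimizer, criterion):
    # Assumption: batch is an iterable of (state, action, reward, next_state, done) tuples
    states, actions, rewards, next_states, dones = zip(*batch)
    states = torch.as_tensor(np.array(states), dtype=torch.float32)
    next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
    actions = torch.as_tensor(np.array(actions), dtype=torch.int64).view(-1, 1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken
    q_values = policy_net(states).gather(1, actions).squeeze(1)

    # Bootstrapped target: r + gamma * max_a' Q_target(s', a') for non-terminal states
    with torch.no_grad():
        next_q = target_net(next_states).max(dim=1).values
    targets = rewards + GAMMA * next_q * (1.0 - dones)

    loss = criterion(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return policy_net, target_net, loss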
    def __init__(self,
                 input_size,
                 num_actions,
                 gamma=DEFAULT_GAMMA,
                 buffer_size=DEFAULT_BUFFER_SIZE,
                 batch_size=DEFAULT_BATCH_SIZE,
                 load_from_path=None,
                 prepare_conv=False):
        """
        Include the double network and is in charge of train and manage it
        :param input_size:
        :param num_actions:
        :param buffer_size: int. Size of the replay buffer
        :param batch_size: int. Size of the Batch
        """
        # Instantiate both models
        net = Raimbow if len(input_size) == 3 else DQN
        self.current_model = net(input_size=input_size,
                                 num_actions=num_actions,
                                 prepare_decoder=prepare_conv)
        if load_from_path is not None:
            self.load_weights(path=load_from_path)
        self.target_model = net(input_size=input_size,
                                num_actions=num_actions,
                                prepare_decoder=prepare_conv)

        # Put them into the GPU if available
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()

        # Initialize the Adam optimizer and the replay buffer
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                           self.current_model.parameters()),
                                    lr=0.00001)
        self.replay_buffer = ReplayBuffer(capacity=buffer_size)

        # Make both networks start with the same weights
        self.update_target()

        # Save the rest of parameters
        self.batch_size = batch_size
        self.gamma = gamma
        self.input_channels = input_size
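
Both the manager class above and the agent in Example no. 1 call self.update_target() to synchronize the target network with the online network, but the method itself is not shown. Assuming both models are torch.nn.Module instances, a hard update is typically a one-liner:

    def update_target(self):
        # Hard update: overwrite the target network with the current network's weights
        self.target_model.load_state_dict(self.current_model.state_dict())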
Example no. 5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
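
The comment above describes tau as the coefficient for the soft update of the target parameters, but the update itself happens outside this excerpt. A minimal sketch of the usual soft (Polyak) update for the Keras-style models used here follows; the method name soft_update is an assumption.

    def soft_update(self, local_model, target_model):
        # theta_target <- tau * theta_local + (1 - tau) * theta_target
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)

It would be called once per learning step, e.g. self.soft_update(self.critic_local.model, self.critic_target.model), and likewise for the actor.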
Example no. 6
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration providing a variety of parameters
        """

        self.env = env
        self.config = config
        # self.seed = (config['seed'])

        # set parameter for ML
        self.set_parameters(config)
        # Replay memory
        self.memory = ReplayBuffer(config)
        # Q-Network
        self.create_agents(config)
        # load agent
        if self.load_model:
            self.load_agent('trained_tennis_2k86.pth')
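
set_parameters(config) is referenced but not shown; below is a rough sketch of what such a helper usually does, assuming config is a plain dict whose key names are purely illustrative and not taken from the original project.

    def set_parameters(self, config):
        # Pull hyperparameters out of the config dict (keys are illustrative)
        self.buffer_size = config.get('buffer_size', int(1e5))
        self.batch_size = config.get('batch_size', 128)
        self.gamma = config.get('gamma', 0.99)
        self.tau = config.get('tau', 1e-3)
        self.lr = config.get('lr', 1e-4)
        self.update_every = config.get('update_every', 4)
        self.load_model = config.get('load_model', False)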
Example no. 7
import gym
import world
import utils
from Buffer import ReplayBuffer
from models import DQN
from world import Print, ARGS
from wrapper import WrapIt
from procedure import train_DQN

# ------------------------------------------------
env = gym.make('RiverraidNoFrameskip-v4')
env = WrapIt(env)
Print('ENV action', env.unwrapped.get_action_meanings())
Print('ENV observation',
      f"Image: {ARGS.imgDIM} X {ARGS.imgDIM} X {1}")  # we assume grayscale images
# ------------------------------------------------
Optimizer = utils.getOptimizer()
schedule = utils.LinearSchedule(1000000, 0.1)

Game_buffer = ReplayBuffer(ARGS.buffersize, ARGS.framelen)

Q = utils.init_model(env, DQN).train().to(world.DEVICE)
Q_target = utils.init_model(env, DQN).eval().to(world.DEVICE)
# ------------------------------------------------
train_DQN(env,
          Q=Q,
          Q_target=Q_target,
          optimizer=Optimizer,
          replay_buffer=Game_buffer,
          exploration=schedule)
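
utils.LinearSchedule(1000000, 0.1) supplies the exploration rate, decaying it linearly from 1.0 to 0.1 over the first million steps. The class itself is not included here; a common implementation of that interface looks roughly like the sketch below (the value() method name follows the widespread baselines convention and is an assumption).

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        # Linearly interpolate from initial_p to final_p over schedule_timesteps steps
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)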
Example no. 8
user_loc = np.random.randint(0, 101, U_num).tolist()  # user locations, cells 1-100
user_dis = random_displacement(user_loc)  # users' future displacement (up/down/left/right: -10, 10, -1, 1)
use_buff = np.random.randint(3, 8, U_num).tolist()  # resources required by each user
state0 = [user_loc, user_dis, node_loc, use_buff]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# main program

policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q network
target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q network
target_net.load_state_dict(policy_net.state_dict())  # copy the Q network's parameters into the target network
target_net.eval()
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=learning_rate)  # Adam optimizer; can be swapped for another
buffer = ReplayBuffer(
    buffer_size)  # replay buffer: stores experience tuples from which random batches are later drawn to train the network (defined in Buffer.py)
criterion = torch.nn.MSELoss(reduction='sum')

# training
for i_episode in range(num_episodes):

    # state0  # get an initial state

    for t in count():
        # select an action
        action = e_greedy_select_action(state0)
        print("action selected by e_greedy is {}".format(action))
        # use the transition function to obtain the next state reached by taking the current action, and whether that next state is terminal
        state1, done, flag = transition_function(state0, action)
        # use the reward function to obtain the current reward
        reward, cost_migration = reward_function(state0, action, state1, flag)
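
The replay buffer used in Examples no. 3 and no. 8 is described only through its add and getBatch calls and a note that it lives in Buffer.py. A minimal sketch matching that interface, with uniform random sampling, could look like this (illustrative only):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, buffer_size):
        # Fixed-size buffer; the oldest transitions are evicted first
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Store one transition tuple
        self.buffer.append((state, action, reward, next_state, done))

    def getBatch(self, batch_size):
        # Uniformly sample a batch of transitions without replacement
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)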