Example #1
    autoencoder = Autoencoder(args.enc_hidden_dim, args.dec_hidden_dim,
                              args.embedding_dim, args.latent_dim,
                              vocab.size(), args.dropout, args.seq_len)
    autoencoder.load_state_dict(
        torch.load('autoencoder.th', map_location=lambda x, y: x))
    generator = Generator(args.n_layers, args.block_dim)
    critic = Critic(args.n_layers, args.block_dim)

    g_optimizer = optim.Adam(generator.parameters(), lr=args.lr)
    c_optimizer = optim.Adam(critic.parameters(), lr=args.lr)

    if args.cuda:
        autoencoder = autoencoder.cuda()
        generator = generator.cuda()
        critic = critic.cuda()

    print('G Parameters:', sum(p.numel() for p in generator.parameters()
                               if p.requires_grad))
    print('C Parameters:', sum(p.numel() for p in critic.parameters()
                               if p.requires_grad))

    best_loss = np.inf

    for epoch in range(1, args.epochs + 1):
        g_loss, c_loss = train(epoch)
        loss = g_loss + c_loss
        if loss < best_loss:
            best_loss = loss
            print('* Saved')
            torch.save(generator.state_dict(), 'generator.th')
Example #2
class AgentDDPG:
    """Deep Deterministic Policy Gradient implementation for continuous action space reinforcement learning tasks"""
    def __init__(self,
                 state_size,
                 hidden_size,
                 action_size,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-2,
                 use_cuda=False,
                 actor_path=None,
                 critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)

        self.critic = Critic(state_size + action_size, hidden_size,
                             action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size,
                                    action_size)

        # Load model state_dicts from saved file
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard copy params from original networks to target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Create replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)

    def save_to_file(self, actor_file, critic_file):
        # Save the state_dicts of the Actor and Critic networks
        torch.save(self.actor.state_dict(), actor_file)
        torch.save(self.critic.state_dict(), critic_file)

    def get_action(self, state):
        """Select action with respect to state according to current policy and exploration noise"""
        state = torch.from_numpy(state).float()

        if self.use_cuda:
            state = state.cuda()

        a = self.actor.forward(state)

        if self.use_cuda:
            return a.detach().cpu().numpy()

        return a.detach().numpy()

    def save_experience(self, state_t, action_t, reward_t, state_t1):
        self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1)

    def update(self, batch_size):
        states, actions, rewards, next_states = self.replay_buffer.get_samples(
            batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        if self.use_cuda:
            states = states.cuda()
            next_states = next_states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()

        # Critic loss
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        policy_loss = -self.critic.forward(states,
                                           self.actor.forward(states)).mean()

        # Update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        soft_copy_params(self.actor, self.actor_target, self.tau)
        soft_copy_params(self.critic, self.critic_target, self.tau)

    def add_noise_to_weights(self, amount=0.1):
        self.actor.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.actor_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
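
The class above is driven entirely through get_action, save_experience, and update. A minimal training-loop sketch, assuming a Gym-style continuous-control environment and that ReplayBuffer reports its current size via len() (both assumptions, not part of the example):

import gym

env = gym.make("Pendulum-v1")  # hypothetical environment choice
agent = AgentDDPG(state_size=env.observation_space.shape[0],
                  hidden_size=256,
                  action_size=env.action_space.shape[0])

batch_size = 64
for episode in range(50):
    state = env.reset()
    for t in range(200):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.save_experience(state, action, reward, next_state)
        state = next_state
        if len(agent.replay_buffer) > batch_size:  # assumes ReplayBuffer implements __len__
            agent.update(batch_size)
        if done:
            break
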
Example #3
class DDPG(object):
    def __init__(self, state_dim, action_dim, max_action, memory, args):

        # actor
        self.actor = Actor(state_dim,
                           action_dim,
                           max_action,
                           layer_norm=args.layer_norm)
        self.actor_target = Actor(state_dim,
                                  action_dim,
                                  max_action,
                                  layer_norm=args.layer_norm)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=args.actor_lr)

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim,
                                    action_dim,
                                    layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            self.actor = self.actor.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size

    def show_lr(self):
        print(self.actor_optimizer.state_dict())

    def select_action(self, state, noise=None):
        state = FloatTensor(state.reshape(-1, self.state_dim))
        action = self.actor(state).cpu().data.numpy().flatten()

        if noise is not None:
            action += noise.sample()

        return np.clip(action, -self.max_action, self.max_action)

    def train(self, iterations):

        for _ in tqdm(range(iterations)):

            # Sample replay buffer
            x, y, u, r, d = self.memory.sample(self.batch_size)
            state = FloatTensor(x)
            action = FloatTensor(u)
            next_state = FloatTensor(y)
            not_done = FloatTensor(1 - d)  # mask: 1 where the episode continues, 0 at terminals
            reward = FloatTensor(r)

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(next_state,
                                              self.actor_target(next_state))
                target_Q = reward + (not_done * self.discount * target_Q)

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def train_critic(self, iterations):

        for _ in tqdm(range(iterations)):

            # Sample replay buffer
            states, n_states, actions, rewards, dones = self.memory.sample(
                self.batch_size)

            sys.stdout.flush()

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(n_states,
                                              self.actor_target(n_states))
                target_Q = rewards + (1 - dones) * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(states, actions)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(states, self.actor(states)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def load(self, filename):
        self.actor.load_model(filename, "actor")
        self.critic.load_model(filename, "critic")

    def save(self, output):
        self.actor.save_model(output, "actor")
        self.critic.save_model(output, "critic")
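
Both training loops above end with the same Polyak-averaging step, theta_target <- tau * theta + (1 - tau) * theta_target. A standalone helper that performs an identical update (a sketch, not part of the original code):

import torch
import torch.nn as nn

def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * param)

With it, the two parameter loops at the end of train() reduce to soft_update(self.actor, self.actor_target, self.tau) and soft_update(self.critic, self.critic_target, self.tau).
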
Example #4
class D3PG(object):
    def __init__(self, state_dim, action_dim, max_action, memory, args):

        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory
        self.n = args.n_actor

        # actors
        self.actors = [
            Actor(state_dim,
                  action_dim,
                  max_action,
                  layer_norm=args.layer_norm) for i in range(self.n)
        ]
        self.actors_target = [
            Actor(state_dim,
                  action_dim,
                  max_action,
                  layer_norm=args.layer_norm) for i in range(self.n)
        ]
        self.actors_optimizer = [
            torch.optim.Adam(self.actors[i].parameters(), lr=args.actor_lr)
            for i in range(self.n)
        ]

        for i in range(self.n):
            self.actors_target[i].load_state_dict(self.actors[i].state_dict())

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim,
                                    action_dim,
                                    layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            for i in range(self.n):
                self.actors[i] = self.actors[i].cuda()
                self.actors_target[i] = self.actors_target[i].cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # shared memory
        for i in range(self.n):
            self.actors[i].share_memory()
            self.actors_target[i].share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size
        self.reward_scale = args.reward_scale

    def train(self, iterations, actor_index):

        for _ in tqdm(range(iterations)):

            # Sample replay buffer
            states, n_states, actions, rewards, dones = self.memory.sample(
                self.batch_size)

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(
                    n_states, self.actors_target[actor_index](n_states))
                target_Q = self.reward_scale * rewards + \
                    (1 - dones) * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(states, actions)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(
                states, self.actors[actor_index](states)).mean()

            # Optimize the actor
            self.actors_optimizer[actor_index].zero_grad()
            actor_loss.backward()
            self.actors_optimizer[actor_index].step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(
                    self.actors[actor_index].parameters(),
                    self.actors_target[actor_index].parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def load(self, filename):
        for i in range(self.n):
            self.actors[i].load_model(filename, "actor_" + str(i))
        self.critic.load_model(filename, "critic")

    def save(self, output):
        for i in range(self.n):
            self.actors[i].save_model(output, "actor_" + str(i))
        self.critic.save_model(output, "critic")
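
Since every network is moved into shared memory, train() is parameterized by actor_index, which suggests one worker per actor. A minimal single-process sketch of that call pattern, assuming memory and args are constructed elsewhere exactly as the constructor expects (the dimensions below are illustrative only):

agent = D3PG(state_dim=17, action_dim=6, max_action=1.0, memory=memory, args=args)
for actor_index in range(args.n_actor):
    agent.train(iterations=200, actor_index=actor_index)
agent.save("output")
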
Example #5
class Agent:
    def __init__(self, env, env_params, args, models=None,
                 record_episodes=(0, .1, .25, .5, .75, 1.)):
        self.env = env
        self.env_params = env_params
        self.args = args

        # networks
        if models is None:
            self.actor = Actor(self.env_params).double()
            self.critic = Critic(self.env_params).double()
        else:
            # `models` is assumed to be a (actor_path, critic_path) pair of checkpoint files
            self.actor, self.critic = self.LoadModels(*models)
        # target networks used to predict env actions with
        self.actor_target = Actor(self.env_params).double()
        self.critic_target = Critic(self.env_params).double()

        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        if self.args.cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()


        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.normalize = Normalizer(env_params, self.args.gamma)
        self.buffer = ReplayBuffer(1_000_000, self.env_params)
        self.tensorboard = ModifiedTensorBoard(log_dir="logs")
        self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes]

    def ModelsEval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def ModelsTrain(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def GreedyAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy().squeeze()
        return action

    def NoiseAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy()
            action += self.args.noise_eps * self.env_params['max_action'] * np.random.randn(*action.shape)
            action = np.clip(action, -self.env_params['max_action'], self.env_params['max_action'])
        return action.squeeze()

    def Update(self):
        self.ModelsTrain()
        for i in range(self.args.n_batch):
            state, a_batch, r_batch, nextstate, d_batch = self.buffer.SampleBuffer(self.args.batch_size)
            a_batch = torch.tensor(a_batch, dtype=torch.double)
            r_batch = torch.tensor(r_batch, dtype=torch.double)
            # d_batch = torch.tensor(d_batch, dtype=torch.double)
            state = torch.tensor(state, dtype=torch.double)
            nextstate = torch.tensor(nextstate, dtype=torch.double)
            # d_batch = 1 - d_batch

            if self.args.cuda:
                a_batch = a_batch.cuda()
                r_batch = r_batch.cuda()
                # d_batch = d_batch.cuda()
                state = state.cuda()
                nextstate = nextstate.cuda()

            with torch.no_grad():
                action_next = self.actor_target.forward(nextstate)
                q_next = self.critic_target.forward(nextstate,action_next)
                q_next = q_next.detach().squeeze()
                q_target = r_batch + self.args.gamma * q_next
                q_target = q_target.detach().squeeze()

            q_prime = self.critic.forward(state, a_batch).squeeze()
            critic_loss = F.mse_loss(q_target, q_prime)

            action = self.actor.forward(state)
            actor_loss = -self.critic.forward(state, action).mean()
            # params = torch.cat([x.view(-1) for x in self.actor.parameters()])
            # l2_reg = self.args.l2_norm *torch.norm(params,2)
            # actor_loss += l2_reg

            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()

        self.SoftUpdateTarget(self.critic, self.critic_target)
        self.SoftUpdateTarget(self.actor, self.actor_target)

    def Explore(self):
        for epoch in range(self.args.n_epochs + 1):
            start_time = time.process_time()
            for cycle in range(self.args.n_cycles):
                for _ in range(self.args.num_rollouts_per_mpi):
                    state = self.env.reset()
                    for t in range(self.env_params['max_timesteps']):
                        action = self.NoiseAction(state)
                        nextstate, reward, done, info = self.env.step([action])
                        nextstate = nextstate.squeeze()
                        reward = self.normalize.normalize_reward(reward)
                        self.buffer.StoreTransition(state, action, reward, nextstate, done)
                        state = nextstate
                    self.Update()
            avg_reward = self.Evaluate()
            self.tensorboard.step = epoch
            elapsed_time = time.process_time() - start_time
            print(f"Epoch {epoch} of total of {self.args.n_epochs +1} epochs, average reward is: {avg_reward}.\
                    Elapsedtime: {int(elapsed_time /60)} minutes {int(elapsed_time %60)} seconds")
            if epoch % 5 or epoch + 1 == self.args.n_epochs:
                self.SaveModels(epoch)
                self.record(epoch)


    def Evaluate(self):
        self.ModelsEval()
        total_reward = []
        episode_reward = 0
        for episode in range(self.args.n_evaluate):
            state = self.env.reset()
            episode_reward = 0
            for t in range(self.env_params['max_timesteps']):
                action = self.GreedyAction(state)
                nextstate, reward, done, info = self.env.step([action])
                episode_reward += reward
                state = nextstate
                if done or t + 1 == self.env_params['max_timesteps']:
                    total_reward.append(episode_reward)
                    episode_reward = 0

        average_reward = sum(total_reward)/len(total_reward)
        min_reward = min(total_reward)
        max_reward = max(total_reward)
        self.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward)
        return average_reward

    def record(self, epoch):
        self.ModelsEval()
        try:
            if not os.path.exists("videos"):
                os.mkdir('videos')
            recorder = VideoRecorder(self.env, path=f'videos/epoch-{epoch}.mp4')
            for _ in range(self.args.n_record):
                done = False
                state = self.env.reset()
                while not done:
                    recorder.capture_frame()
                    action = self.GreedyAction(state)
                    nextstate, reward, done, info = self.env.step([action])
                    state = nextstate
            # close the recorder only after all requested episodes have been captured
            recorder.close()
        except Exception as e:
            print(e)

    def SaveModels(self, ep):
        if not os.path.exists("models"):
            os.mkdir('models')
        torch.save(self.actor.state_dict(), os.path.join('models', 'Actor.pt'))
        torch.save(self.critic.state_dict(), os.path.join('models', 'Critic.pt'))

    def LoadModels(self, actorpath, criticpath):
        actor = Actor(self.env_params).double()
        critic = Critic(self.env_params).double()
        actor.load_state_dict(torch.load(actorpath))
        critic.load_state_dict(torch.load(criticpath))
        return actor, critic

    def SoftUpdateTarget(self, source, target):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
Example #6
class ddpg_agent:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        # get the number of inputs...
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        self.action_scale = self.env.action_space.high[0]
        # build up the network
        self.actor_net = Actor(num_inputs, num_actions)
        self.critic_net = Critic(num_inputs, num_actions)
        # get the target network...
        self.actor_target_net = Actor(num_inputs, num_actions)
        self.critic_target_net = Critic(num_inputs, num_actions)
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()
        # copy the parameters..
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())
        # setup the optimizer...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.args.actor_lr)
        self.optimizer_critic = torch.optim.Adam(
            self.critic_net.parameters(),
            lr=self.args.critic_lr,
            weight_decay=self.args.critic_l2_reg)
        # setting up the noise
        self.ou_noise = OUNoise(num_actions)
        # check some dir
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = self.args.save_dir + self.args.env_name + '/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start to train the network..
    def learn(self):
        # init the brain memory
        replay_buffer = []
        total_timesteps = 0
        running_reward = None
        for episode_idx in range(self.args.max_episode):
            state = self.env.reset()
            # get the scale of the ou noise...
            self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * max(0, self.args.exploration_length - episode_idx) / \
                                self.args.exploration_length + self.args.final_noise_scale
            self.ou_noise.reset()
            # start the training
            reward_total = 0
            while True:
                state_tensor = torch.tensor(state,
                                            dtype=torch.float32).unsqueeze(0)
                if self.args.cuda:
                    state_tensor = state_tensor.cuda()
                with torch.no_grad():
                    policy = self.actor_net(state_tensor)
                # start to select the actions...
                actions = self._select_actions(policy)
                # step
                state_, reward, done, _ = self.env.step(actions *
                                                        self.action_scale)
                total_timesteps += 1
                reward_total += reward
                # start to store the samples...
                replay_buffer.append((state, reward, actions, done, state_))
                # check if the buffer size is outof range
                if len(replay_buffer) > self.args.replay_size:
                    replay_buffer.pop(0)
                if len(replay_buffer) > self.args.batch_size:
                    mini_batch = random.sample(replay_buffer,
                                               self.args.batch_size)
                    # start to update the network
                    _, _ = self._update_network(mini_batch)
                if done:
                    break
                state = state_
            running_reward = reward_total if running_reward is None else running_reward * 0.99 + reward_total * 0.01
            if episode_idx % self.args.display_interval == 0:
                torch.save(self.actor_net.state_dict(),
                           self.model_path + 'model.pt')
                print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format(
                    datetime.now(), episode_idx, total_timesteps,
                    running_reward))

        self.env.close()

    # select actions
    def _select_actions(self, policy):
        actions = policy.detach().cpu().numpy()[0]
        actions = actions + self.ou_noise.noise()
        actions = np.clip(actions, -1, 1)
        return actions

    # update the network
    def _update_network(self, mini_batch):
        state_batch = np.array([element[0] for element in mini_batch])
        state_batch = torch.tensor(state_batch, dtype=torch.float32)
        # reward batch
        reward_batch = np.array([element[1] for element in mini_batch])
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float32).unsqueeze(1)
        # done batch
        done_batch = np.array([int(element[3]) for element in mini_batch])
        done_batch = 1 - done_batch
        done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1)
        # action batch
        actions_batch = np.array([element[2] for element in mini_batch])
        actions_batch = torch.tensor(actions_batch, dtype=torch.float32)
        # next state
        state_next_batch = np.array([element[4] for element in mini_batch])
        state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32)
        # check if use the cuda
        if self.args.cuda:
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            done_batch = done_batch.cuda()
            actions_batch = actions_batch.cuda()
            state_next_batch = state_next_batch.cuda()

        # update the critic network...
        with torch.no_grad():
            actions_out = self.actor_target_net(state_next_batch)
            expected_q_value = self.critic_target_net(state_next_batch,
                                                      actions_out)
        # get the target value
        target_value = reward_batch + self.args.gamma * expected_q_value * done_batch
        target_value = target_value.detach()
        values = self.critic_net(state_batch, actions_batch)
        critic_loss = (target_value - values).pow(2).mean()
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()
        # start to update the actor network
        actor_loss = -self.critic_net(state_batch,
                                      self.actor_net(state_batch)).mean()
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()
        # then, start to softupdate the network...
        self._soft_update_target_network(self.critic_target_net,
                                         self.critic_net)
        self._soft_update_target_network(self.actor_target_net, self.actor_net)

        return actor_loss.item(), critic_loss.item()

    # soft update the network
    def _soft_update_target_network(self, target, source):
        # update the critic network firstly...
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(self.args.tau * param.data +
                                    (1 - self.args.tau) * target_param.data)

    # functions to test the network
    def test_network(self):
        model_path = self.args.save_dir + self.args.env_name + '/model.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # start to test
        for _ in range(5):
            state = self.env.reset()
            reward_sum = 0
            while True:
                self.env.render()
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    actions = self.actor_net(state)
                actions = actions.detach().numpy()[0]
                state_, reward, done, _ = self.env.step(self.action_scale *
                                                        actions)
                reward_sum += reward
                if done:
                    break
                state = state_
            print('The reward of this episode is {}.'.format(reward_sum))
        self.env.close()
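
The constructor only reads plain attributes from args, so the snippet can be exercised without the original command-line parser. A hedged sketch of the expected fields (the names are taken from the attribute accesses above; the values are illustrative defaults, not from the original project):

from types import SimpleNamespace
import gym

args = SimpleNamespace(
    cuda=False, actor_lr=1e-4, critic_lr=1e-3, critic_l2_reg=0.0,
    gamma=0.99, tau=1e-2, batch_size=64, replay_size=int(1e6),
    max_episode=1000, noise_scale=0.3, final_noise_scale=0.05,
    exploration_length=100, display_interval=10,
    save_dir='saved_models/', env_name='Pendulum-v1')

env = gym.make(args.env_name)
agent = ddpg_agent(args, env)
agent.learn()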