Code Example #1
File: ddqn.py  Project: zynk13/Deep-RL-Keras
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """

    def __init__(self, action_dim, state_dim, args):
        """ Initialization
        """
        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames,) + state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        if(len(state_dim) < 3):
            self.tau = 1e-2
        else:
            self.tau = 1.0
        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if(self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay


    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done  = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if(self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if(args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if(self.with_per):
            q_val = self.agent.predict(new_state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(q_val)
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)
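
The per-sample loop in train_agent above implements the Double-DQN target: the online network chooses the next action, the target network evaluates it, and the absolute TD error is used as the PER priority. Below is a minimal, self-contained NumPy sketch of that same update on placeholder data (the arrays are made up for illustration and are not part of the project):

import numpy as np

# Toy batch: 4 transitions, 3 actions (placeholder data for illustration only)
gamma = 0.95
q      = np.random.rand(4, 3)   # Q(s, .) from the online network
next_q = np.random.rand(4, 3)   # Q(s', .) from the online network
q_targ = np.random.rand(4, 3)   # Q(s', .) from the target network
a = np.array([0, 2, 1, 0])      # actions taken
r = np.array([1.0, 0.0, -1.0, 0.5])
d = np.array([False, False, True, False])

# Double-DQN: the online network selects a*, the target network evaluates it
next_best = np.argmax(next_q, axis=1)
target = r + gamma * q_targ[np.arange(len(r)), next_best] * (1 - d)
td_error = np.abs(q[np.arange(len(a)), a] - target)   # PER priority in the examples above
q[np.arange(len(a)), a] = target                      # regression targets for agent.fit(s, q)
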
Code Example #2
class DDQN:
    """ Deep Q-Learning Main Algorithm          深度Q学习主要算法
    """
    def __init__(self, action_dim, state_dim, args):
        """ Initialization      初始化
        """
        # Environment and DDQN parameters       环境和DDQN参数
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames, ) + state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        if (len(state_dim) < 3):
            self.tau = 1e-2
        else:
            self.tau = 1.0
        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau,
                           args.dueling)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an epsilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer            从缓冲区采样的批次训练Q网络
        """
        # Sample experience from memory buffer (optionally with PER)    来自内存缓冲区的示例体验(可选配PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN     在批次样本里应用Bellman方程来训练我们的DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if (self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay

    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer                           将经验存储在内存缓冲区中
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        if (self.with_per):
            path += '_PER'
        self.agent.save(path)

    def load_weights(self, path):
        self.agent.load_weights(path)
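
transfer_weights() is not shown in these snippets; in the DDQN examples the target network is pulled toward the online network at rate tau (1e-2 for low-dimensional states, 1.0, i.e. a hard copy, for image input), mirroring the 'transfer' branch of target_model_update in Code Example #7. A minimal sketch of that Polyak soft update, assuming Keras-style get_weights() lists:

import numpy as np

def soft_update(weights, target_weights, tau):
    """Polyak-average online weights into the target network (tau=1.0 is a hard copy)."""
    return [tau * w + (1.0 - tau) * tw for w, tw in zip(weights, target_weights)]

# Toy weight lists standing in for model.get_weights() / target_model.get_weights()
w     = [np.ones((2, 2)), np.zeros(2)]
tgt_w = [np.zeros((2, 2)), np.ones(2)]
print(soft_update(w, tgt_w, tau=1e-2)[0])   # the target drifts 1% toward the online weights
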
Code Example #3
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """

    def __init__(self, act_dim, env_dim, act_range, k, buffer_size = 20000, gamma = 0.99, lr = 0.00005, tau = 0.001):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = (k,) + env_dim
        self.gamma = gamma
        self.lr = lr
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env, args, summary_writer):
        results = []

        # First, gather experience
        tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:

            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a+noise.generate(time), -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)
                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = self.sample_batch(args.batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict([new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Gather stats every episode for plotting
            if(args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()
            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)
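
The bellman helper above computes the critic target with a Python loop; the same y = r + gamma * Q'(s', mu'(s')) target, with bootstrapping switched off on terminal transitions, can be written in one vectorized step. A self-contained NumPy sketch on placeholder data:

import numpy as np

gamma = 0.99
rewards  = np.array([0.1, 1.0, -0.5, 0.0])
q_values = np.array([0.8, 0.2, 0.5, 0.9])   # critic.target_predict([s', actor.target_predict(s')])
dones    = np.array([False, True, False, False])

# y_i = r_i                      if s' is terminal
# y_i = r_i + gamma * Q'(s', a') otherwise
critic_target = rewards + gamma * q_values * (~dones)
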
Code Example #4
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """
    def __init__(self, action_dim, state_dim, args):
        """ Initialization
        """
        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        self.tau = 1e-2

        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau,
                           args.dueling, args.hidden_dim)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if (self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay

    def train(self, env, args, summary_writer, envtest=None):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")
        epoch = 0
        gross_profit = 0
        WritetoCsvFile("logFile_1.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", "maxProfit",
            "maxLOSS", "avgProfit", "avgLOSS", "countprofit", "countloss",
            "maxdrop", "Total profit", "total_reward", "TRADES", "epoch"
        ])
        WritetoCsvFile("logFileDetail.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", 'maxProfit',
            'maxLOSS', 'avgProfit', 'avgLOSS', 'maxdrop', 'Total profit',
            'gross profit', "total_reward", 'TRADES', 'epoch'
        ])

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            ##########################################
            total_reward = 0
            total_profit = 0
            total_loss = 0
            total_profitMax = 0
            total_profitMin = 0
            max_drop = 0
            profitLst = []
            lossLst = []
            trades = 0
            step = 0
            ##########################################

            while not done:
                #if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                #new_state, r, done, _ = env.step(a)

                #######################################################
                new_state, r, done, buy, sell, profit = env.step(a)

                total_reward += r
                if profit != 0:
                    trades += 1
                    total_profit += profit
                    if total_profit > total_profitMax:
                        total_profitMax = total_profit
                        total_profitMin = total_profit
                    if total_profit < total_profitMin:
                        total_profitMin = total_profit
                        try:
                            if total_profitMax != 0 and max_drop < (
                                    total_profitMax -
                                    total_profitMin) / total_profitMax:
                                max_drop = (total_profitMax -
                                            total_profitMin) / total_profitMax
                        except:
                            max_drop = 0

                if profit > 0:
                    profitLst.append(profit)
                elif profit < 0:
                    lossLst.append(profit)

                step += 1
                if step % 1500 == 0:
                    print(
                        'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}/{} TRADES: {}  '
                        .format(np.max(profitLst + [0]),
                                -np.min(lossLst + [0]),
                                np.mean(profitLst + [0]),
                                -np.mean(lossLst + [0]), max_drop,
                                total_profit, gross_profit, trades))

                    WritetoCsvFile("logFileDetail.csv", [
                        "train", args.trainf, args.history_win, args.stop,
                        args.usevol, args.dueling, args.traineval,
                        args.allprices, args.allprices2, args.allprices3,
                        args.ma1, args.ma2, args.madifference, args.hidema,
                        args.candlenum, args.hidden_dim,
                        np.max(profitLst + [0]), -np.min(lossLst + [0]),
                        np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                        max_drop, total_profit, gross_profit, total_reward,
                        trades, epoch
                    ])
                #done = True if step == len(env.data) - 3 else False
                ######################################################
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            gross_profit += total_profit
            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            l_profit = tfSummary('profit', total_profit)
            l_aprofit = tfSummary('average profit', np.mean(profitLst))
            l_aloss = tfSummary('l_aloss', -np.mean(lossLst))
            l_trades = tfSummary('l_trades', trades)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.add_summary(l_profit, global_step=e)
            summary_writer.add_summary(l_aprofit, global_step=e)
            summary_writer.add_summary(l_aloss, global_step=e)
            summary_writer.add_summary(l_trades, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            self.agent.saveModel("./models/model_ep", "")
            results = [
                np.max(profitLst + [0]), -np.min(lossLst + [0]),
                np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                len(profitLst),
                len(lossLst), max_drop, total_profit, total_reward, trades
            ]

            WritetoCsvFile("logFile_1.csv", [
                "train", args.trainf, args.history_win, args.stop, args.usevol,
                args.dueling, args.traineval, args.allprices, args.allprices2,
                args.allprices3, args.ma1, args.ma2, args.madifference,
                args.hidema, args.candlenum, args.hidden_dim
            ] + results + [epoch])
            if envtest:  # If a test environment is given, evaluate after every epoch
                newargs = args
                newargs.traineval = False
                self.evaluate(envtest,
                              newargs,
                              summary_writer,
                              model=None,
                              epoch=epoch)

            epoch += 1
        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(q_val)
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def evaluate(self, env, args, summary_writer, model, epoch=0):
        """ Evaluate            """
        results = []
        if model:
            self.agent.loadModel_versoin(model, "")
        done = False
        old_state = env.reset()
        ##########################################
        total_reward = 0
        total_profit = 0
        total_loss = 0
        total_profitMax = 0
        total_profitMin = 0
        max_drop = 0
        profitLst = []
        lossLst = []
        step = 0
        trades = 0
        ##########################################
        while not done:
            # if args.render: env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, buy, sell, profit = env.step(a)

            #######################################################
            total_reward += r
            if profit != 0:
                trades += 1
                total_profit += profit
                if total_profit > total_profitMax:
                    total_profitMax = total_profit
                    total_profitMin = total_profit
                if total_profit < total_profitMin:
                    total_profitMin = total_profit
                    try:
                        if total_profitMax != 0 and max_drop < (
                                total_profitMax -
                                total_profitMin) / total_profitMax:
                            max_drop = (total_profitMax -
                                        total_profitMin) / total_profitMax
                    except:
                        max_drop = 0
            if profit > 0:
                profitLst.append(profit)

            elif profit < 0:
                lossLst.append(profit)
            step += 1
            if step % 1500 == 0:
                print(
                    'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}  Total reward: {}  TRADES: {}  '
                    .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                            max_drop, total_profit, total_reward, trades))
                WritetoCsvFile("logFileDetail.csv", [
                    "eval", args.trainf, args.history_win, args.stop,
                    args.usevol, args.dueling, args.traineval, args.allprices,
                    args.allprices2, args.allprices3, args.ma1, args.ma2,
                    args.madifference, args.hidema, args.candlenum,
                    args.hidden_dim,
                    np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, total_profit, total_reward, trades,
                    epoch
                ])
            #done = True if step == len(env.data) - 2 else False
            ######################################################
            # Memorize for experience replay
            if args.traineval:
                self.memorize(old_state, a, r, done, new_state)
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()
            # Update current state
            old_state = new_state
        print(
            'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {} Total reward: {} TRADES: {}  '
            .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, total_reward, trades))
        results = [
            np.max(profitLst + [0]), -np.min(lossLst + [0]),
            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
            len(profitLst),
            len(lossLst), max_drop, total_profit, total_reward, trades
        ]
        WritetoCsvFile("logFile_1.csv", [
            "eval", args.trainf, args.history_win, args.stop, args.usevol,
            args.dueling, args.traineval, args.allprices, args.allprices2,
            args.allprices3, args.ma1, args.ma2, args.madifference,
            args.hidema, args.candlenum, args.hidden_dim
        ] + results + [epoch])
        return results
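
The training and evaluation loops above track max_drop as the largest relative fall of cumulative profit from its running peak. A compact, self-contained sketch of that drawdown calculation on a made-up equity curve:

import numpy as np

equity = np.array([0.0, 10.0, 25.0, 18.0, 30.0, 12.0, 20.0])  # cumulative profit after each trade
running_peak = np.maximum.accumulate(equity)
with np.errstate(divide='ignore', invalid='ignore'):
    drawdowns = np.where(running_peak > 0, (running_peak - equity) / running_peak, 0.0)
max_drop = drawdowns.max()   # 0.6 here: 30 -> 12 is a 60% fall from the peak
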
Code Example #5
File: ddpg.py  Project: CUN-bjy/gym-ddpg-keras
class ddpgAgent():
    """Deep Deterministic Policy Gradient(DDPG) Agent
	"""
    def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True):
        # gym environments
        self.env = env_
        self.discrete = is_discrete
        self.obs_dim = env_.observation_space.shape[0]
        self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[
            0]

        self.action_bound = (env_.action_space.high - env_.action_space.low
                             ) / 2 if not is_discrete else 1.
        self.action_shift = (env_.action_space.high + env_.action_space.low
                             ) / 2 if not is_discrete else 0.

        # initialize actor & critic and its targets
        self.discount_factor = 0.99
        self.actor = ActorNet(self.obs_dim,
                              self.act_dim,
                              self.action_bound,
                              lr_=1e-4,
                              tau_=1e-3)
        self.critic = CriticNet(self.obs_dim,
                                self.act_dim,
                                lr_=1e-3,
                                tau_=1e-3,
                                discount_factor=self.discount_factor)

        # Experience Buffer
        self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per)
        self.with_per = w_per
        self.batch_size = batch_size
        # OU-Noise-Process
        self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

    ###################################################
    # Network Related
    ###################################################
    def make_action(self, obs, t, noise=True):
        """ predict next action from Actor's Policy
		"""
        action_ = self.actor.predict(obs)[0]
        a = np.clip(action_ + (self.noise.generate(t) if noise else 0),
                    -self.action_bound, self.action_bound)
        return a

    def update_networks(self, obs, acts, critic_target):
        """ Train actor & critic from sampled experience
		"""
        # update critic
        self.critic.train(obs, acts, critic_target)

        # get next action and Q-value Gradient
        n_actions = self.actor.network.predict(obs)
        q_grads = self.critic.Qgradient(obs, n_actions)

        # update actor
        self.actor.train(obs, self.critic.network, q_grads)

        # update target networks
        self.actor.target_update()
        self.critic.target_update()

    def replay(self, replay_num_):
        if self.with_per and (self.buffer.size() <= self.batch_size): return

        for _ in range(replay_num_):
            # sample from buffer
            states, actions, rewards, dones, new_states, idx = self.sample_batch(
                self.batch_size)

            # get target q-value using target network
            q_vals = self.critic.target_predict(
                [new_states, self.actor.target_predict(new_states)])

            # bellman iteration for target critic value
            critic_target = np.asarray(q_vals)
            for i in range(q_vals.shape[0]):
                if dones[i]:
                    critic_target[i] = rewards[i]
                else:
                    critic_target[
                        i] = self.discount_factor * q_vals[i] + rewards[i]

                if self.with_per:
                    self.buffer.update(idx[i],
                                       abs(q_vals[i] - critic_target[i]))

            # train(or update) the actor & critic and target networks
            self.update_networks(states, actions, critic_target)

    ####################################################
    # Buffer Related
    ####################################################

    def memorize(self, obs, act, reward, done, new_obs):
        """store experience in the buffer
		"""
        if self.with_per:
            q_val = self.critic.network(
                [np.expand_dims(obs, axis=0),
                 self.actor.predict(obs)])
            next_action = self.actor.target_network.predict(
                np.expand_dims(new_obs, axis=0))
            q_val_t = self.critic.target_predict(
                [np.expand_dims(new_obs, axis=0), next_action])
            new_val = reward + self.discount_factor * q_val_t
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0

        self.buffer.memorize(obs, act, reward, done, new_obs, td_error)

    def sample_batch(self, batch_size):
        """ Sampling from the batch
		"""
        return self.buffer.sample_batch(batch_size)

    ###################################################
    # Save & Load Networks
    ###################################################
    def save_weights(self, path):
        """ Agent's Weights Saver
		"""
        self.actor.save_network(path)
        self.critic.save_network(path)

    def load_weights(self, pretrained):
        """ Agent's Weights Loader
		"""
        self.actor.load_network(pretrained)
        self.critic.load_network(pretrained)
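
OrnsteinUhlenbeckProcess is used for exploration in several of these DDPG examples but its implementation is not shown. The sketch below is a generic OU process with the usual discretization; the parameter names (mu, theta, sigma, dt) and their defaults are assumptions, and the projects' own classes may differ:

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0,1)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.size, self.mu, self.theta, self.sigma, self.dt = size, mu, theta, sigma, dt
        self.x = np.ones(size) * mu

    def generate(self, t):
        # t is accepted for interface compatibility; the state evolves one step per call
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x

noise = OUNoise(size=2)
a = np.clip(np.array([0.3, -0.1]) + noise.generate(0), -1.0, 1.0)  # noisy, clipped action
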
Code Example #6
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """
    def __init__(self,
                 act_dim,
                 env_dim,
                 act_range,
                 k,
                 buffer_size=20000,
                 gamma=0.99,
                 lr=0.00005,
                 tau=0.001):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = (40, )
        self.gamma = gamma
        self.lr = lr
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, summary_writer):
        env = CarEnv()
        results = []
        i = 0
        # First, gather experience
        tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            old_state = np.array(old_state).reshape(40, )
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            while not done:
                # if args.render: env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise.generate(time), -self.act_range,
                            self.act_range)
                a = float(a[0])
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a, time)
                print("Now r is {}".format(r))
                # Add outputs to memory buffer
                temp_next = old_state.copy()
                temp_next[:4] = temp_next[4:8]
                temp_next[4:8] = temp_next[8:12]
                temp_next[8:12] = temp_next[12:16]
                temp_next[12:16] = temp_next[16:20]
                temp_next[16:20] = temp_next[20:24]
                temp_next[20:24] = temp_next[24:28]
                temp_next[24:28] = temp_next[28:32]
                temp_next[28:32] = temp_next[32:36]
                temp_next[32:36] = temp_next[36:40]
                temp_next[36:40] = new_state
                temp_next = np.array(temp_next).reshape(40, )
                self.memorize(old_state, a, r, done, temp_next)
                old_state = temp_next.copy()
                cumul_reward += r
                time += 1

            # since episode is over destroying actors in the scenario
            for actor in env.actor_list:
                actor.destroy()
            # Sample experience from buffer
            for _ in range(50):  # index unused; avoid clobbering the episode counter i
                states, actions, rewards, dones, new_states, _ = self.sample_batch(
                    64)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states,
                     self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                print("learning happened")

            # mean, stdev, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data = gather_stats(self, env, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data)
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

            # Export results for Tensorboard
            print(cumul_reward)
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()
            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            i += 1
            if i % 10 == 0:
                df = pd.DataFrame(np.array(results))
                df.to_csv("DDPG" + "/logs.csv",
                          header=['Episode', 'Mean', 'Stddev'],
                          float_format='%10.5f')

        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)
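
The block of temp_next[...] assignments in the training loop above is a manual sliding window: the 40-dimensional state stacks the last ten 4-feature observations, and each step drops the oldest four values and appends the new observation. A compact equivalent, assuming that same 4-feature layout:

import numpy as np

FRAME = 4                               # features per observation (assumed from the 4-wide slices above)
old_state = np.arange(40, dtype=float)  # placeholder stacked state
new_obs   = np.full(FRAME, -1.0)        # placeholder new observation

temp_next = np.concatenate([old_state[FRAME:], new_obs])   # shift left by one frame, append the new one
assert temp_next.shape == (40,)
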
Code Example #7
class Agent:
    """ Stock Trading Bot """
    def __init__(self,
                 buffer_size,
                 state_size,
                 action_size=3,
                 learning_rate=0.001):

        # agent config
        self.buffer = MemoryBuffer(buffer_size, True)
        self.state_size = state_size
        self.action_size = action_size
        self.inventory = []

        # model config
        self.gamma = 0.95  # affinity for long term reward
        self.loss = huber_loss
        self.optimizer = Adam(lr=learning_rate)

        # target network
        self.model = self._model()
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def _model(self):

        inputs = Input(shape=self.state_size)
        x = Dense(64, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)

        value = Dense(self.action_size, activation='linear')(x)
        a = Dense(self.action_size, activation='linear')(x)
        mean = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(a)
        advantage = Subtract()([a, mean])
        q = Add()([value, advantage])

        model = Model(inputs=inputs, outputs=q)
        model.compile(loss=self.loss, optimizer=self.optimizer)
        return model

    def remember(self, state, action, reward, next_state, done):

        # No plain `self.memory` list is initialized in __init__, so route the
        # transition through the PER-aware path (which computes a TD-error priority)
        self.remember_sumtree(state, action, reward, next_state, done)

    def act(self, state, epsilon, is_eval=False):

        # take random action in order to diversify experience at the beginning
        if not is_eval and random.random() <= epsilon:
            return random.randrange(self.action_size)

        state = state.reshape((-1, ) + self.state_size)
        action_probs = self.model.predict(state)
        return np.argmax(action_probs[0])

    def epsilon_decay(self, epsilon, epsilon_min, epsilon_decay):
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay
        return epsilon

    def remember_sumtree(
        self,
        state,
        action,
        reward,
        new_state,
        done,
    ):

        state = state.reshape((-1, ) + self.state_size)
        new_state = new_state.reshape((-1, ) + self.state_size)

        q_val = self.model.predict(state)
        q_val_t = self.target_model.predict(new_state)
        next_best_action = np.argmax(q_val)
        new_val = reward + self.gamma * q_val_t[0, next_best_action]
        td_error = abs(new_val - q_val + 1e-8)[0]

        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def target_model_update(self,
                            done,
                            tau=0.1,
                            type='reset',
                            reset_every=5000):

        if type == 'reset':
            if self.n_iter % reset_every == 0:
                print('update target model')
                # reset target model weights
                self.target_model.set_weights(self.model.get_weights())

        if type == 'transfer':
            if done:
                W = self.model.get_weights()
                tgt_W = self.target_model.get_weights()
                for i in range(len(W)):
                    tgt_W[i] = tau * W[i] + (1 - tau) * tgt_W[i]
                self.target_model.set_weights(tgt_W)

    def train_experience_replay_sumtree(
        self,
        batch_size,
    ):

        state, action, reward, done, new_state, idx = self.buffer.sample_batch(
            batch_size)

        state = state.reshape((-1, ) + self.state_size)
        new_state = new_state.reshape((-1, ) + self.state_size)

        q = self.model.predict(state)

        next_q = self.model.predict(new_state)
        q_targ = self.target_model.predict(new_state)

        for i in range(state.shape[0]):
            old_q = q[i, action[i]]
            if done[i]:
                q[i, action[i]] = reward[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, action[i]] = reward[i] + self.gamma * q_targ[
                    i, next_best_action]
            self.buffer.update(idx[i], abs(old_q - q[i, action[i]]))

        loss = self.model.fit((state), q, epochs=1,
                              verbose=0).history["loss"][0]

        return loss

    def save(self, name):
        if not os.path.exists('save/' + name):
            os.makedirs('save/' + name)
            np.save('save/' + name + '/data.npy', self.buffer.buffer.data)
            np.save('save/' + name + '/tree.npy', self.buffer.buffer.tree)
            self.model.save('save/' + name + '/model.h5')
            self.target_model.save('save/' + name + '/target_model.h5')
        else:
            print('already exist, please check.')

    def load(self, name):
        if not os.path.exists('save/' + name):
            print('not exist, please check.')
        else:
            self.buffer.buffer.data = np.load('save/' + name + '/data.npy',
                                              allow_pickle=True)
            self.buffer.buffer.tree = np.load('save/' + name + '/tree.npy',
                                              allow_pickle=True)
            self.model = load_model('save/' + name + '/model.h5')
            self.target_model = load_model('save/' + name + '/target_model.h5')
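
The _model method builds a dueling head, Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). Note that the snippet gives the value stream action_size units, whereas a textbook dueling head uses a single-unit V(s) and relies on broadcasting. A self-contained NumPy illustration of the combination step in that textbook form:

import numpy as np

value     = np.array([[1.5]])              # V(s): one scalar per state (textbook dueling form)
advantage = np.array([[0.2, -0.1, 0.4]])   # A(s, a) for 3 actions
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
# q == [[1.5333, 1.2333, 1.7333]]; subtracting the mean keeps V and A identifiable
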
Code Example #8
class DDPG(object):
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """
    def __init__(self,
                 action_dim,
                 state_dim,
                 batch_size,
                 step,
                 buffer_size,
                 train_indicator,
                 episode,
                 gamma,
                 lra,
                 lrc,
                 tau,
                 load_weight=True):
        """ Initialization
        """
        # Environment and A2C parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator
        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)
        # !: the weights folder needs to be specified; ensure only one set of actor & critic weights is in this folder
        self.weights_dir_path = os.getcwd() + r"\saved_model\*.h5"

        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_path = glob.glob(self.weights_dir_path)

                for file_path in weights_file_path:
                    if file_path.find("actor") < 0:
                        weights_critic_path = file_path
                    if file_path.find("critic") < 0:
                        weights_actor_path = file_path

                self.load_weights(weights_actor_path, weights_critic_path)

                print("")
                print("Actor-Critic Models are loaded with weights...")
                print("")
            except:
                print("")
                print(
                    "Weights are failed to be loaded, please check weights loading path..."
                )
                print("")

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target (one action only)
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset episode
            # set initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            state_old = env.get_vissim_state(
                1, 180 * 5, [45, 55, 60, 65, 70, 75, 80
                             ])  #TODO: make sure states are received correctly
            actions, states, rewards = [], [], []

            print("Episode: ", e, " ========================:")

            for t in range(self.step):
                action_original = self.policy_action(state_old)

                #TODO: OU function params?
                noise = OrnsteinUhlenbeckProcess(x0=action_original,
                                                 size=self.action_dim)

                # action = action_orig + noise
                action = noise.apply_ou(t)

                # adjust too-low or too-high action
                adj_action = np.zeros(len(action))
                for index, value in enumerate(action):
                    adj_action[index] = clip(value, -1, 1)

                #action_mapping function
                transformed_action = Transformation.convert_actions(adj_action)

                reward, state_new = env.get_vissim_reward(
                    180 * 5, transformed_action)

                # TODO: if we know what the optimal discharging rate, then we set that as done
                if t == self.step - 1:  # we treat the manually set last step as done
                    done = True

                # ======================================================================================= Training section
                if (self.train_indicator):
                    # Add outputs to memory buffer
                    self.memorize(state_old, adj_action, reward, done,
                                  state_new)
                    # Sample experience from buffer
                    states_old, actions, rewards, dones, states_new = self.sample_batch(
                        self.batch_size)
                    # Predict target q-values using target networks
                    q_values = self.critic.target_predict(
                        [states_new,
                         self.actor.target_predict(states_new)])
                    # Compute critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on sampled batch, update target networks
                    self.update_models(states_old, actions, critic_target)
                    # calculate loss
                    loss = self.critic.train_on_batch(states_old, actions,
                                                      critic_target)
                    state_old = state_new
                    cumul_reward += reward
                    cumul_loss += loss
                # =======================================================================================

                # ======================================================================================= report
                print("|---> Step: ", t, " | Action: ", transformed_action,
                      " | Reward: ", reward, " | Loss: ", loss)
                # =======================================================================================

            # ======================================================================================= save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """
            # ======================================================================================= save model

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " +
                  str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")

            # garbage recycling
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        time = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + str(
            t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + time
        path_critic = path + '_LR_{}'.format(self.lrc) + time
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
Code Example #9
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """
    def __init__(self, action_dim, state_dim, args, input_size, hp,
                 export_path, env):
        """ Initialization
        """

        self.export_path = export_path

        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames, ) + state_dim
        #
        self.lr = hp["lr"]
        self.gamma = 0.99
        # Exploration parameters for epsilon greedy strategy
        self.explore_start = self.epsilon = 1.0  # exploration probability at start
        self.explore_stop = 0.1  # minimum exploration probability
        self.decay_rate = 0.000001  # exponential decay rate for exploration prob

        self.buffer_size = 20000
        self.input_size = input_size

        self.video_dir = args.video_dir

        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, args.dueling,
                           input_size, args.load)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

        try:
            # Init buffer
            threads = 16
            p = Pool(processes=threads)
            while self.buffer.size() < self.buffer_size:

                # Set up threaded frame accumulation
                buffers = p.map_async(init_buffer, [env] * threads)
                datas = buffers.get()

                # Record in global memory
                for data in datas:
                    for entry in data:
                        self.memorize(*entry)

                # Mitigate memory leak
                del buffers
                del datas

                print("Buffer size: {}".format(self.buffer.size()))

        except KeyboardInterrupt:
            p.close()
            p.join()
        p.close()
        p.join()

        # Train on pure randomness for a while
        tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            record = False
            if e % 100 == 0: record = True
            self.train_agent(args.batch_size, record)

            if e % 1000 == 0:
                self.agent.transfer_weights()

            # Display score
            tqdm_e.refresh()

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if np.random.random() <= self.epsilon:
            return np.random.randint(self.action_dim)
        else:
            a_vect = self.agent.predict(s)[0]
            return np.argmax(a_vect)

    def train_agent(self, batch_size, record=False):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)
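        # Double DQN target: the online network (next_q) selects the best next
        # action, while the target network (q_targ) evaluates its value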

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]

        # Train on batch
        self.agent.fit(s, q, record=record)

    def train(self, env, args):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        decay_step = 0
        self.t = 0
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, cumul_r_r, done = 0, 0, 0, False
            position = deque(maxlen=50)
            position.append(0)
            old_state = env.reset()

            while not done:
                decay_step += 1
                env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)

                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)

                # Clip the raw reward to {-1, 0, +1} (DQN-style reward clipping)
                if r == 0: r_r = 0
                elif r > 0: r_r = 1
                else: r_r = -1

                # Shaping bonus for moving: proportional to the range of recent positions
                if a == 2: position.append(position[-1] + 1)
                if a == 3: position.append(position[-1] - 1)
                r_w = abs(max(position) - min(position)) / 10000
                r_r += r_w

                # Memorize the shaped transition for experience replay
                self.memorize(old_state, a, r_r, done, new_state)

                # Update current state
                old_state = new_state
                cumul_reward += r
                cumul_r_r += r_r
                time += 1

                self.epsilon = self.explore_stop + (
                    self.explore_start - self.explore_stop) * np.exp(
                        -self.decay_rate * decay_step)

                # Train DDQN
                if (self.buffer.size() >
                        args.batch_size) and self.t % 2000 == 0:
                    self.train_agent(args.batch_size)
                self.t += 1

                if self.t % 10000 == 0:
                    self.agent.transfer_weights()

            if e % 50 == 0:
                self.agent.save("./model.h5")
                wandb.save("./model.h5")

            if e % 100 == 0:
                # wandb logging
                evaluate(cumul_reward, self.epsilon)
                self.train_agent(args.batch_size, record=True)

            # Display score
            text = "Score: {}, Fake Score: {:.2f}".format(
                str(cumul_reward), cumul_r_r)
            tqdm_e.set_description(text)
            tqdm_e.refresh()

            # Log the most recent recorded gameplay video every 50 episodes
            if e % 50 == 0:
                mp4list = glob.glob('video/' + self.video_dir + '/*.mp4')
                if len(mp4list) > 0:
                    # log gameplay video in wandb
                    wandb.log({
                        "gameplays": wandb.Video(mp4list[-1], fps=4, format="gif")
                    })

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if self.with_per:
            # TD-error used as the PER priority: compare the Q-value of the
            # taken action against the Double DQN target for this transition
            q_val = self.agent.predict(state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            q_val_t = self.agent.target_predict(new_state)
            new_val = reward if done else \
                reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val[0, action])
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def save(self, path):
        self.agent.save(path)

    def load_weights(self, path):
        self.agent.load_weights(path)
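
A minimal usage sketch for the DDQN class above, assuming a Gym-style Atari environment and an `args` namespace carrying the fields the code reads (`with_per`, `consecutive_frames`, `dueling`, `load`, `video_dir`, `batch_size`, `nb_episodes`); the environment name, hyperparameter values, and export path are illustrative only, and train() additionally expects wandb to have been initialised:

import gym
from argparse import Namespace

env = gym.make("PongDeterministic-v4")      # illustrative environment
args = Namespace(with_per=False, consecutive_frames=4, dueling=True,
                 load=False, video_dir="pong", batch_size=32,
                 nb_episodes=5000)
hp = {"lr": 2.5e-4}                         # only "lr" is read in __init__

algo = DDQN(env.action_space.n, env.observation_space.shape, args,
            input_size=env.observation_space.shape, hp=hp,
            export_path="./export", env=env)    # also pre-fills the buffer
algo.train(env, args)
algo.save("./model.h5")
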
Code Example #10
0
class td3Agent:
    """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Agent
    """
    def __init__(self,
                 env_,
                 is_discrete=False,
                 batch_size=100,
                 w_per=True,
                 update_delay=2):
        # gym environments
        self.env = env_
        self.discrete = is_discrete
        self.obs_dim = env_.observation_space.shape[0]
        self.act_dim = (env_.action_space.n if is_discrete
                        else env_.action_space.shape[0])

        self.action_bound = ((env_.action_space.high - env_.action_space.low) / 2
                             if not is_discrete else 1.)
        self.action_shift = ((env_.action_space.high + env_.action_space.low) / 2
                             if not is_discrete else 0.)

        # initialize actor & critic and its targets
        self.discount_factor = 0.99
        self.actor = ActorNet(self.obs_dim,
                              self.act_dim,
                              self.action_bound,
                              lr_=3e-4,
                              tau_=5e-3)
        self.critic = CriticNet(self.obs_dim,
                                self.act_dim,
                                lr_=3e-4,
                                tau_=5e-3,
                                discount_factor=self.discount_factor)

        # Experience Buffer
        self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per)
        self.with_per = w_per
        self.batch_size = batch_size
        # OU-Noise-Process
        self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

        # for Delayed Policy Update
        self._update_step = 0
        self._target_update_interval = update_delay

    ###################################################
    # Network Related
    ###################################################
    def make_action(self, obs, t, noise=True):
        """ Predict the next action from the actor's policy
        """
        action_ = self.actor.predict(obs)[0]
        sigma = 0.1  # std of the exploration Gaussian
        # Parenthesise the noise term so that noise=False keeps the clean action
        # (previously the whole sum collapsed to 0 when noise was disabled)
        a = np.clip(
            action_ + (np.random.normal(0, self.action_bound * sigma)
                       if noise else 0),
            -self.action_bound, self.action_bound)
        #a = np.clip(action_ + self.noise.generate(t) if noise else 0, -self.action_bound, self.action_bound)
        return a

    def make_target_action(self, obs, noise=True):
        """ Predict the next action from the actor's target policy
        """
        action_ = self.actor.target_predict(obs)
        sigma = 0.2
        #return action_
        # Target policy smoothing: add clipped Gaussian noise to the target action
        clipped_noise = np.clip(np.random.normal(0, self.action_bound * sigma),
                                -self.action_bound * 0.5,
                                self.action_bound * 0.5)
        a = np.clip(action_ + (clipped_noise if noise else 0),
                    -self.action_bound, self.action_bound)
        return a

    def update_networks(self, obs, acts, critic_target):
        """ Train actor & critic from sampled experience
		"""
        # update critic
        self.critic.train(obs, acts, critic_target)
        if self._update_step % self._target_update_interval == 0:
            # update actor
            self.actor.train(obs, self.critic.network_1)

            # update target networks
            self.actor.target_update()
            self.critic.target_update()
        self._update_step = self._update_step + 1

    def train(self):
        if self.with_per and (self.buffer.size() <= self.batch_size): return

        # sample from buffer
        states, actions, rewards, dones, new_states, idx = self.sample_batch(
            self.batch_size)

        # get target q-value using target network
        new_actions = self.make_target_action(new_states)
        q1_vals = self.critic.target_network_1.predict(
            [new_states, new_actions])
        q2_vals = self.critic.target_network_2.predict(
            [new_states, new_actions])

        # bellman iteration for target critic value
        q_vals = np.min(np.vstack([q1_vals.transpose(),
                                   q2_vals.transpose()]),
                        axis=0)
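        # Clipped double-Q: the element-wise minimum of the two target critics
        # is used as the target value to curb overestimation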
        critic_target = np.asarray(q_vals)
        # print(np.vstack([q1_vals.transpose(),q2_vals.transpose()]))
        # print(q_vals)
        for i in range(q1_vals.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.discount_factor * q_vals[i]

            if self.with_per:
                self.buffer.update(idx[i], abs(q_vals[i] - critic_target[i]))

        # train(or update) the actor & critic and target networks
        self.update_networks(states, actions, critic_target)

    ####################################################
    # Buffer Related
    ####################################################

    def memorize(self, obs, act, reward, done, new_obs):
        """ Store experience in the buffer
        """
        if self.with_per:
            # PER priority from a one-step TD-error estimate; the critic's
            # first network and the actor's target policy are used here,
            # matching the attribute names referenced elsewhere in this class
            q_val = self.critic.network_1.predict(
                [np.expand_dims(obs, axis=0),
                 self.actor.predict(obs)])
            next_action = self.actor.target_predict(
                np.expand_dims(new_obs, axis=0))
            q_val_t = self.critic.target_network_1.predict(
                [np.expand_dims(new_obs, axis=0), next_action])
            new_val = reward + self.discount_factor * q_val_t
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0

        self.buffer.memorize(obs, act, reward, done, new_obs, td_error)

    def sample_batch(self, batch_size):
        """ Sampling from the batch
		"""
        return self.buffer.sample_batch(batch_size)

    ###################################################
    # Save & Load Networks
    ###################################################
    def save_weights(self, path):
        """ Agent's Weights Saver
		"""
        self.actor.save_network(path)
        self.critic.save_network(path)

    def load_weights(self, pretrained):
        """ Agent's Weights Loader
		"""
        self.actor.load_network(pretrained)
        self.critic.load_network(pretrained)
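
A minimal interaction-loop sketch for td3Agent, assuming a continuous-control environment under the classic Gym step/reset API and that the module-level dependencies (ActorNet, CriticNet, MemoryBuffer, OrnsteinUhlenbeckProcess, BUFFER_SIZE) are available; the environment name and episode budget are illustrative:

import gym

env = gym.make("Pendulum-v1")               # illustrative continuous-action task
agent = td3Agent(env, is_discrete=False, batch_size=100, w_per=False)

for episode in range(200):                  # illustrative episode budget
    obs, done, t = env.reset(), False, 0
    while not done:
        act = agent.make_action(obs, t)
        new_obs, reward, done, _ = env.step(act)
        agent.memorize(obs, act, reward, done, new_obs)
        # only update once enough transitions have been collected
        if agent.buffer.size() > agent.batch_size:
            agent.train()
        obs, t = new_obs, t + 1

agent.save_weights("./td3")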