Example 1
class DDQN:
    """ Deep Q-Learning Main Algorithm          深度Q学习主要算法
    """
    def __init__(self, action_dim, state_dim, args):
        """ Initialization      初始化
        """
        # Environment and DDQN parameters       环境和DDQN参数
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames, ) + state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        if (len(state_dim) < 3):
            self.tau = 1e-2
        else:
            self.tau = 1.0
        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau,
                           args.dueling)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action          应用epsilon-greedy策略选择下一步操作
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer            从缓冲区采样的批次训练Q网络
        """
        # Sample experience from memory buffer (optionally with PER)    来自内存缓冲区的示例体验(可选配PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN     在批次样本里应用Bellman方程来训练我们的DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if (self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay

    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm                DDQN主要训练算法
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer                           将经验存储在内存缓冲区中
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val[0, action])
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        if (self.with_per):
            path += '_PER'
        self.agent.save(path)

    def load_weights(self, path):
        self.agent.load_weights(path)
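
The MemoryBuffer constructed above (and in the later examples) is not part of this listing. The sketch below covers only the interface the agents rely on (memorize, sample_batch, update, size), using proportional prioritized sampling over a plain deque; the original repositories typically back PER with a SumTree, and the alpha/epsilon constants here are assumptions rather than values taken from the source.

from collections import deque

import numpy as np


class MemoryBuffer:
    def __init__(self, buffer_size, with_per=False):
        self.with_per = with_per
        self.buffer = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)
        self.alpha = 0.6       # how strongly priorities skew sampling (assumed)
        self.eps = 1e-6        # keeps every priority strictly positive (assumed)

    def memorize(self, state, action, reward, done, new_state, error=0.0):
        # Store the transition together with a priority derived from its TD error
        self.buffer.append((state, action, reward, done, new_state))
        self.priorities.append((float(np.max(np.abs(error))) + self.eps) ** self.alpha)

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        if self.with_per:
            probs = np.asarray(self.priorities) / np.sum(self.priorities)
            idx = np.random.choice(len(self.buffer), batch_size, p=probs)
        else:
            idx = np.random.choice(len(self.buffer), batch_size)
        batch = [self.buffer[i] for i in idx]
        s, a, r, d, new_s = map(np.asarray, zip(*batch))
        return s, a, r, d, new_s, idx

    def update(self, idx, new_error):
        # Refresh the priority of a single transition after its TD error changed
        self.priorities[idx] = (float(np.max(np.abs(new_error))) + self.eps) ** self.alpha

A SumTree replaces the O(N) normalization in sample_batch with O(log N) sampling, which only starts to matter once the buffer holds hundreds of thousands of transitions.
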
Example 2
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """

    def __init__(self, action_dim, state_dim, args):
        """ Initialization
        """
        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames,) + state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        if(len(state_dim) < 3):
            self.tau = 1e-2
        else:
            self.tau = 1.0
        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if(self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay


    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done  = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if(self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if(args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if(self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val[0, action])
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)
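
The Agent class that both DDQN variants wrap (predict, target_predict, fit, transfer_weights) is likewise external. Below is a minimal sketch matching the call signature used in Examples 1 and 2, assuming a flat MLP Q-network built with tf.keras and soft (Polyak) target updates at rate tau; the dueling flag is accepted but the dueling head itself is omitted, and the layer sizes are assumptions.

import numpy as np
from tensorflow.keras import layers, models, optimizers


class Agent:
    def __init__(self, state_dim, action_dim, lr, tau, dueling=False):
        self.state_dim = tuple(state_dim)
        self.tau = tau
        self.dueling = dueling  # accepted for interface parity; dueling head not built here
        self.model = self._build(action_dim, lr)
        self.target_model = self._build(action_dim, lr)
        self.target_model.set_weights(self.model.get_weights())

    def _build(self, action_dim, lr):
        model = models.Sequential([
            layers.Input(shape=self.state_dim),
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(action_dim, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=optimizers.Adam(lr))
        return model

    def _batched(self, x):
        # Add a batch axis when a single state is passed in
        x = np.asarray(x)
        return x[np.newaxis, ...] if x.ndim == len(self.state_dim) else x

    def predict(self, state):
        return self.model.predict(self._batched(state), verbose=0)

    def target_predict(self, state):
        return self.target_model.predict(self._batched(state), verbose=0)

    def fit(self, states, q_targets):
        self.model.fit(states, q_targets, epochs=1, verbose=0)

    def transfer_weights(self):
        # Polyak-average the online weights into the target network at rate tau
        online, target = self.model.get_weights(), self.target_model.get_weights()
        self.target_model.set_weights(
            [self.tau * w + (1.0 - self.tau) * tw for w, tw in zip(online, target)])
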
Example 3
class ddpgAgent():
    """Deep Deterministic Policy Gradient(DDPG) Agent
	"""
    def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True):
        # gym environments
        self.env = env_
        self.discrete = is_discrete
        self.obs_dim = env_.observation_space.shape[0]
        self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[
            0]

        self.action_bound = (env_.action_space.high - env_.action_space.low
                             ) / 2 if not is_discrete else 1.
        self.action_shift = (env_.action_space.high + env_.action_space.low
                             ) / 2 if not is_discrete else 0.

        # initialize actor & critic and its targets
        self.discount_factor = 0.99
        self.actor = ActorNet(self.obs_dim,
                              self.act_dim,
                              self.action_bound,
                              lr_=1e-4,
                              tau_=1e-3)
        self.critic = CriticNet(self.obs_dim,
                                self.act_dim,
                                lr_=1e-3,
                                tau_=1e-3,
                                discount_factor=self.discount_factor)

        # Experience Buffer
        self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per)
        self.with_per = w_per
        self.batch_size = batch_size
        # OU-Noise-Process
        self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

    ###################################################
    # Network Related
    ###################################################
    def make_action(self, obs, t, noise=True):
        """ predict next action from Actor's Policy
		"""
        action_ = self.actor.predict(obs)[0]
        a = np.clip(action_ + (self.noise.generate(t) if noise else 0),
                    -self.action_bound, self.action_bound)
        return a

    def update_networks(self, obs, acts, critic_target):
        """ Train actor & critic from sampled experience
		"""
        # update critic
        self.critic.train(obs, acts, critic_target)

        # get next action and Q-value Gradient
        n_actions = self.actor.network.predict(obs)
        q_grads = self.critic.Qgradient(obs, n_actions)

        # update actor
        self.actor.train(obs, self.critic.network, q_grads)

        # update target networks
        self.actor.target_update()
        self.critic.target_update()

    def replay(self, replay_num_):
        if self.with_per and (self.buffer.size() <= self.batch_size): return

        for _ in range(replay_num_):
            # sample from buffer
            states, actions, rewards, dones, new_states, idx = self.sample_batch(
                self.batch_size)

            # get target q-value using target network
            q_vals = self.critic.target_predict(
                [new_states, self.actor.target_predict(new_states)])

            # bellman iteration for target critic value
            critic_target = np.asarray(q_vals)
            for i in range(q_vals.shape[0]):
                if dones[i]:
                    critic_target[i] = rewards[i]
                else:
                    critic_target[
                        i] = self.discount_factor * q_vals[i] + rewards[i]

                if self.with_per:
                    self.buffer.update(idx[i],
                                       abs(q_vals[i] - critic_target[i]))

            # train(or update) the actor & critic and target networks
            self.update_networks(states, actions, critic_target)

    ####################################################
    # Buffer Related
    ####################################################

    def memorize(self, obs, act, reward, done, new_obs):
        """store experience in the buffer
		"""
        if self.with_per:
            q_val = self.critic.network(
                [np.expand_dims(obs, axis=0),
                 self.actor.predict(obs)])
            next_action = self.actor.target_network.predict(
                np.expand_dims(new_obs, axis=0))
            q_val_t = self.critic.target_predict(
                [np.expand_dims(new_obs, axis=0), next_action])
            new_val = reward + self.discount_factor * q_val_t
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0

        self.buffer.memorize(obs, act, reward, done, new_obs, td_error)

    def sample_batch(self, batch_size):
        """ Sampling from the batch
		"""
        return self.buffer.sample_batch(batch_size)

    ###################################################
    # Save & Load Networks
    ###################################################
    def save_weights(self, path):
        """ Agent's Weights Saver
		"""
        self.actor.save_network(path)
        self.critic.save_network(path)

    def load_weights(self, pretrained):
        """ Agent's Weights Loader
		"""
        self.actor.load_network(pretrained)
        self.critic.load_network(pretrained)
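
The OrnsteinUhlenbeckProcess that this agent (and the DDPG and TD3 examples below) draws exploration noise from is not shown. A minimal temporally correlated OU generator with a generate(t) method could look like the following sketch; theta, sigma and dt are common defaults rather than the original values, and t is only kept for interface compatibility (some variants anneal sigma with it).

import numpy as np


class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.x_prev = np.zeros(size)

    def generate(self, t):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1); t is unused here
        dx = (self.theta * (self.mu - self.x_prev) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x_prev = self.x_prev + dx
        return self.x_prev
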
Example 4
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """
    def __init__(self, action_dim, state_dim, args):
        """ Initialization
        """
        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        self.tau = 1e-2

        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau,
                           args.dueling, args.hidden_dim)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if (self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay

    def train(self, env, args, summary_writer, envtest=None):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")
        epoch = 0
        gross_profit = 0
        WritetoCsvFile("logFile_1.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", "maxProfit",
            "maxLOSS", "avgProfit", "avgLOSS", "countprofit", "countloss",
            "maxdrop", "Total profit", "total_reward", "TRADES", "epoch"
        ])
        WritetoCsvFile("logFileDetail.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", 'maxProfit',
            'maxLOSS', 'avgProfit', 'avgLOSS', 'maxdrop', 'Total profit',
            'gross profit', "total_reward", 'TRADES', 'epoch'
        ])

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            ##########################################
            total_reward = 0
            total_profit = 0
            total_loss = 0
            total_profitMax = 0
            total_profitMin = 0
            max_drop = 0
            profitLst = []
            lossLst = []
            trades = 0
            step = 0
            ##########################################

            while not done:
                #if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                #new_state, r, done, _ = env.step(a)

                #######################################################
                new_state, r, done, buy, sell, profit = env.step(a)

                total_reward += r
                if profit != 0:
                    trades += 1
                    total_profit += profit
                    if total_profit > total_profitMax:
                        total_profitMax = total_profit
                        total_profitMin = total_profit
                    if total_profit < total_profitMin:
                        total_profitMin = total_profit
                        try:
                            if total_profitMax != 0 and max_drop < (
                                    total_profitMax -
                                    total_profitMin) / total_profitMax:
                                max_drop = (total_profitMax -
                                            total_profitMin) / total_profitMax
                        except:
                            max_drop = 0

                if profit > 0:
                    profitLst.append(profit)
                elif profit < 0:
                    lossLst.append(profit)

                step += 1
                if step % 1500 == 0:
                    print(
                        'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}/{} TRADES: {}  '
                        .format(np.max(profitLst + [0]),
                                -np.min(lossLst + [0]),
                                np.mean(profitLst + [0]),
                                -np.mean(lossLst + [0]), max_drop,
                                total_profit, gross_profit, trades))

                    WritetoCsvFile("logFileDetail.csv", [
                        "train", args.trainf, args.history_win, args.stop,
                        args.usevol, args.dueling, args.traineval,
                        args.allprices, args.allprices2, args.allprices3,
                        args.ma1, args.ma2, args.madifference, args.hidema,
                        args.candlenum, args.hidden_dim,
                        np.max(profitLst + [0]), -np.min(lossLst + [0]),
                        np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                        max_drop, total_profit, gross_profit, total_reward,
                        trades, epoch
                    ])
                #done = True if step == len(env.data) - 3 else False
                ######################################################
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            gross_profit += total_profit
            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            l_profit = tfSummary('profit', total_profit)
            l_aprofit = tfSummary('average profit', np.mean(profitLst))
            l_aloss = tfSummary('l_aloss', -np.mean(lossLst))
            l_trades = tfSummary('l_trades', trades)
            np.mean(profitLst), -np.mean(lossLst)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.add_summary(l_profit, global_step=e)
            summary_writer.add_summary(l_aprofit, global_step=e)
            summary_writer.add_summary(l_aloss, global_step=e)
            summary_writer.add_summary(l_trades, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            self.agent.saveModel("./models/model_ep", "")
            results = [
                np.max(profitLst + [0]), -np.min(lossLst + [0]),
                np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                len(profitLst),
                len(lossLst), max_drop, total_profit, total_reward, trades
            ]

            WritetoCsvFile("logFile_1.csv", [
                "train", args.trainf, args.history_win, args.stop, args.usevol,
                args.dueling, args.traineval, args.allprices, args.allprices2,
                args.allprices3, args.ma1, args.ma2, args.madifference,
                args.hidema, args.candlenum, args.hidden_dim
            ] + results + [epoch])
            if envtest:  # If a test environment is provided, evaluate after every epoch
                newargs = args
                newargs.traineval = False
                self.evaluate(envtest,
                              newargs,
                              summary_writer,
                              model=None,
                              epoch=epoch)

            epoch += 1
        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val[0, action])
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def evaluate(self, env, args, summary_writer, model, epoch=0):
        """ Evaluate            """
        results = []
        if model:
            self.agent.loadModel_versoin(model, "")
        done = False
        old_state = env.reset()
        ##########################################
        total_reward = 0
        total_profit = 0
        total_loss = 0
        total_profitMax = 0
        total_profitMin = 0
        max_drop = 0
        profitLst = []
        lossLst = []
        step = 0
        trades = 0
        ##########################################
        while not done:
            # if args.render: env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, buy, sell, profit = env.step(a)

            #######################################################
            total_reward += r
            if profit != 0:
                trades += 1
                total_profit += profit
                if total_profit > total_profitMax:
                    total_profitMax = total_profit
                    total_profitMin = total_profit
                if total_profit < total_profitMin:
                    total_profitMin = total_profit
                    try:
                        if total_profitMax != 0 and max_drop < (
                                total_profitMax -
                                total_profitMin) / total_profitMax:
                            max_drop = (total_profitMax -
                                        total_profitMin) / total_profitMax
                    except:
                        max_drop = 0
            if profit > 0:
                profitLst.append(profit)

            elif profit < 0:
                lossLst.append(profit)
            step += 1
            if step % 1500 == 0:
                print(
                    'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}  Total reward: {}  TRADES: {}  '
                    .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                            max_drop, total_profit, total_reward, trades))
                WritetoCsvFile("logFileDetail.csv", [
                    "eval", args.trainf, args.history_win, args.stop,
                    args.usevol, args.dueling, args.traineval, args.allprices,
                    args.allprices2, args.allprices3, args.ma1, args.ma2,
                    args.madifference, args.hidema, args.candlenum,
                    args.hidden_dim,
                    np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, total_profit, total_reward, trades,
                    epoch
                ])
            #done = True if step == len(env.data) - 2 else False
            ######################################################
            # Memorize for experience replay
            if args.traineval:
                self.memorize(old_state, a, r, done, new_state)
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()
            # Update current state
            old_state = new_state
        print(
            'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {} Total reward: {} TRADES: {}  '
            .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, total_reward, trades))
        results = [
            np.max(profitLst + [0]), -np.min(lossLst + [0]),
            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
            len(profitLst),
            len(lossLst), max_drop, total_profit, total_reward, trades
        ]
        WritetoCsvFile("logFile_1.csv", [
            "eval", args.trainf, args.history_win, args.stop, args.usevol,
            args.dueling, args.traineval, args.allprices, args.allprices2,
            args.allprices3, args.ma1, args.ma2, args.madifference,
            args.hidema, args.candlenum, args.hidden_dim
        ] + results + [epoch])
        return results
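
Examples 1, 2, 4 and 6 log scalars through a tfSummary helper and a TF1-style summary_writer, neither of which appears in the listing. A plausible minimal helper, assuming the tf.compat.v1 summary API, is:

import tensorflow as tf


def tfSummary(tag, val):
    """Wrap a scalar in a TF1 Summary protobuf so FileWriter.add_summary can log it."""
    return tf.compat.v1.Summary(
        value=[tf.compat.v1.Summary.Value(tag=tag, simple_value=float(val))])


# Typical pairing with the writer the training loops expect:
#   summary_writer = tf.compat.v1.summary.FileWriter('./logs')
#   summary_writer.add_summary(tfSummary('score', cumul_reward), global_step=e)
#   summary_writer.flush()
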
Example 5
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """
    def __init__(self, action_dim, state_dim, args, input_size, hp,
                 export_path, env):
        """ Initialization
        """

        self.export_path = export_path

        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames, ) + state_dim
        #
        self.lr = hp["lr"]
        self.gamma = 0.99
        # Exploration parameters for epsilon greedy strategy
        self.explore_start = self.epsilon = 1.0  # exploration probability at start
        self.explore_stop = 0.1  # minimum exploration probability
        self.decay_rate = 0.000001  # exponential decay rate for exploration prob

        self.buffer_size = 20000
        self.input_size = input_size

        self.video_dir = args.video_dir

        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, args.dueling,
                           input_size, args.load)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

        try:
            # Init buffer
            threads = 16
            p = Pool(processes=threads)
            while self.buffer.size() < self.buffer_size:

                # Set up threaded frame accumulation
                buffers = p.map_async(init_buffer, [env] * threads)
                datas = buffers.get()

                # Record in global memory
                for data in datas:
                    for entry in data:
                        self.memorize(*entry)

                # Mitigate memory leak
                del buffers
                del datas

                print("Buffer size: {}".format(self.buffer.size()))

        except KeyboardInterrupt:
            p.close()
            p.join()
        p.close()
        p.join()

        # Train on pure randomness for a while
        tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            record = False
            if e % 100 == 0: record = True
            self.train_agent(args.batch_size, record)

            if e % 1000 == 0:
                self.agent.transfer_weights()

            # Display score
            tqdm_e.refresh()

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if np.random.random() <= self.epsilon:
            return np.random.randint(self.action_dim)
        else:
            a_vect = self.agent.predict(s)[0]
            return np.argmax(a_vect)

    def train_agent(self, batch_size, record=False):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]

        # Train on batch
        self.agent.fit(s, q, record=record)

    def train(self, env, args):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        decay_step = 0
        self.t = 0
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, cumul_r_r, done = 0, 0, 0, False
            position = deque(maxlen=50)
            position.append(0)
            old_state = env.reset()

            while not done:
                decay_step += 1
                env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)

                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)

                # Memorize for experience replay
                if r == 0: r_r = 0
                elif r > 0: r_r = 1
                else: r_r = -1

                # Reward for not staying in place
                if a == 2: position.append(position[-1] + 1)
                if a == 3: position.append(position[-1] - 1)
                r_w = abs(max(position) - min(position)) / 10000
                r_r += r_w

                self.memorize(old_state, a, r_r, done, new_state)

                # Update current state
                old_state = new_state
                cumul_reward += r
                cumul_r_r += r_r
                time += 1

                self.epsilon = self.explore_stop + (
                    self.explore_start - self.explore_stop) * np.exp(
                        -self.decay_rate * decay_step)

                # Train DDQN
                if (self.buffer.size() >
                        args.batch_size) and self.t % 2000 == 0:
                    self.train_agent(args.batch_size)
                self.t += 1

                if self.t % 10000 == 0:
                    self.agent.transfer_weights()

            if e % 50 == 0:
                self.agent.save("./model.h5")
                wandb.save("./model.h5")

            if e % 100 == 0:
                # wandb logging
                evaluate(cumul_reward, self.epsilon)
                self.train_agent(args.batch_size, record=True)

            # Display score
            text = "Score: {}, Fake Score: {:.2f}".format(
                str(cumul_reward), cumul_r_r)
            tqdm_e.set_description(text)
            tqdm_e.refresh()

            # render gameplay video
            if (e % 50 == 0):
                mp4list = glob.glob('video/' + self.video_dir + '/*.mp4')
                if len(mp4list) > 0:
                    mp4 = mp4list[-1]
                    video = io.open(mp4, 'r+b').read()
                    encoded = base64.b64encode(video)
                    # log gameplay video in wandb
                    wandb.log(
                        {"gameplays": wandb.Video(mp4, fps=4, format="gif")})

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val[0, action])
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def save(self, path):
        self.agent.save(path)

    def load_weights(self, path):
        self.agent.load_weights(path)
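
Unlike Examples 1, 2 and 4, which multiply epsilon by a fixed decay factor after every training step, Example 5 anneals it exponentially in the number of environment steps. The schedule, pulled out as a standalone function with the same constants used above, looks like this:

import numpy as np


def epsilon_at(step, explore_start=1.0, explore_stop=0.1, decay_rate=1e-6):
    """Exploration probability after `step` environment interactions."""
    return explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * step)


if __name__ == "__main__":
    for step in (0, 100_000, 1_000_000, 5_000_000):
        print(step, round(float(epsilon_at(step)), 3))
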
Example 6
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """

    def __init__(self, act_dim, env_dim, act_range, act_min, k, algo_clustering, episode_length, buffer_size=20000, gamma=0.99, lr=0.00005, tau=0.001, add_noise=True):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.act_min = act_min
        self.env_dim = (k,) + env_dim
        self.gamma = gamma
        self.lr = lr
        self.add_noise = add_noise
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)
        self.episode_length = episode_length

        # The ddqn algorithm for clustering
        self.ddqn_clustering = algo_clustering

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env, args, summary_writer):
        results = []

        # First, gather experience
        tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:

            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            if self.add_noise:
                noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            while (not done) and (time < self.episode_length):
                if args.render:
                    env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                a_clustering = self.ddqn_clustering.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                if self.add_noise:
                    a = np.clip(a + noise.generate(time), self.act_min, self.act_range)
                else:
                    a = np.clip(a, self.act_min, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                a_and_c = {'a': a, 'c': a_clustering}
                new_state, r, done, _ = env.step(a_and_c)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)

                # add outputs to the DDQN memory buffer
                self.ddqn_clustering.memorize(old_state, a_clustering, r, done, new_state)
                # Train DDQN and transfer weights to target network
                if(self.buffer.size() > args.batch_size):
                    self.ddqn_clustering.train_agent(args.batch_size)
                    self.ddqn_clustering.agent.transfer_weights()

                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = self.sample_batch(args.batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict([new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)

                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)

                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            cumul_reward /= self.episode_length

            # Gather stats every episode for plotting
            if(args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()
            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic, path_ddqn):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)
        self.ddqn_clustering.load_weights(path_ddqn)
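
The per-sample loop in DDPG.bellman (and the analogous target construction in the ddpgAgent and td3Agent examples) can be collapsed into a single vectorized expression. A sketch that assumes 1-D reward/done arrays and a (batch,) or (batch, 1) array of bootstrap Q-values:

import numpy as np


def bellman_targets(rewards, q_values, dones, gamma=0.99):
    """Vectorized critic target: r + gamma * Q'(s', a'), masked by the done flag."""
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    q_next = np.asarray(q_values, dtype=np.float64).reshape(-1)
    not_done = 1.0 - np.asarray(dones, dtype=np.float64).reshape(-1)
    return rewards + gamma * q_next * not_done
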
Example 7
class td3Agent():
    """Twin Delayed Deep Deterministic Policy Gradient(TD3) Agent
	"""
    def __init__(self,
                 env_,
                 is_discrete=False,
                 batch_size=100,
                 w_per=True,
                 update_delay=2):
        # gym environments
        self.env = env_
        self.discrete = is_discrete
        self.obs_dim = env_.observation_space.shape[0]
        self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[
            0]

        self.action_bound = (env_.action_space.high - env_.action_space.low
                             ) / 2 if not is_discrete else 1.
        self.action_shift = (env_.action_space.high + env_.action_space.low
                             ) / 2 if not is_discrete else 0.

        # initialize actor & critic and its targets
        self.discount_factor = 0.99
        self.actor = ActorNet(self.obs_dim,
                              self.act_dim,
                              self.action_bound,
                              lr_=3e-4,
                              tau_=5e-3)
        self.critic = CriticNet(self.obs_dim,
                                self.act_dim,
                                lr_=3e-4,
                                tau_=5e-3,
                                discount_factor=self.discount_factor)

        # Experience Buffer
        self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per)
        self.with_per = w_per
        self.batch_size = batch_size
        # OU-Noise-Process
        self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

        # for Delayed Policy Update
        self._update_step = 0
        self._target_update_interval = update_delay

    ###################################################
    # Network Related
    ###################################################
    def make_action(self, obs, t, noise=True):
        """ predict next action from Actor's Policy
		"""
        action_ = self.actor.predict(obs)[0]
        sigma = 0.1  # std of gaussian
        a = np.clip(
            action_ +
            (np.random.normal(0, self.action_bound * sigma) if noise else 0),
            -self.action_bound, self.action_bound)
        #a = np.clip(action_ + self.noise.generate(t) if noise else 0, -self.action_bound, self.action_bound)
        return a

    def make_target_action(self, obs, noise=True):
        """ predict next action from Actor's Target Policy
		"""
        action_ = self.actor.target_predict(obs)
        sigma = 0.2
        #return action_
        clipped_noise = np.clip(np.random.normal(0, self.action_bound * sigma),
                                -self.action_bound * 0.5,
                                self.action_bound * 0.5)
        a = np.clip(action_ + (clipped_noise if noise else 0),
                    -self.action_bound, self.action_bound)
        return a

    def update_networks(self, obs, acts, critic_target):
        """ Train actor & critic from sampled experience
		"""
        # update critic
        self.critic.train(obs, acts, critic_target)
        if self._update_step % self._target_update_interval == 0:
            # update actor
            self.actor.train(obs, self.critic.network_1)

            # update target networks
            self.actor.target_update()
            self.critic.target_update()
        self._update_step = self._update_step + 1

    def train(self):
        if self.with_per and (self.buffer.size() <= self.batch_size): return

        # sample from buffer
        states, actions, rewards, dones, new_states, idx = self.sample_batch(
            self.batch_size)

        # get target q-value using target network
        new_actions = self.make_target_action(new_states)
        q1_vals = self.critic.target_network_1.predict(
            [new_states, new_actions])
        q2_vals = self.critic.target_network_2.predict(
            [new_states, new_actions])

        # bellman iteration for target critic value
        q_vals = np.min(np.vstack([q1_vals.transpose(),
                                   q2_vals.transpose()]),
                        axis=0)
        critic_target = np.asarray(q_vals)
        # print(np.vstack([q1_vals.transpose(),q2_vals.transpose()]))
        # print(q_vals)
        for i in range(q1_vals.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[
                    i] = self.discount_factor * q_vals[i] + rewards[i]

            if self.with_per:
                self.buffer.update(idx[i], abs(q_vals[i] - critic_target[i]))

        # train(or update) the actor & critic and target networks
        self.update_networks(states, actions, critic_target)

    ####################################################
    # Buffer Related
    ####################################################

    def memorize(self, obs, act, reward, done, new_obs):
        """store experience in the buffer
		"""
        if self.with_per:
            # not implemented for td3, yet.
            q_val = self.critic.network(
                [np.expand_dims(obs, axis=0),
                 self.actor.predict(obs)])
            next_action = self.actor.target_network.predict(
                np.expand_dims(new_obs, axis=0))
            q_val_t = self.critic.target_network.predict(
                [np.expand_dims(new_obs, axis=0), next_action])
            new_val = reward + self.discount_factor * q_val_t
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0

        self.buffer.memorize(obs, act, reward, done, new_obs, td_error)

    def sample_batch(self, batch_size):
        """ Sampling from the batch
		"""
        return self.buffer.sample_batch(batch_size)

    ###################################################
    # Save & Load Networks
    ###################################################
    def save_weights(self, path):
        """ Agent's Weights Saver
		"""
        self.actor.save_network(path)
        self.critic.save_network(path)

    def load_weights(self, pretrained):
        """ Agent's Weights Loader
		"""
        self.actor.load_network(pretrained)
        self.critic.load_network(pretrained)
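
None of the snippets include the outer loop that actually drives td3Agent. Below is a hedged sketch of how it is typically wired up; it assumes the classic 4-tuple gym step API, a continuous-control task such as Pendulum-v1, and that ActorNet, CriticNet, MemoryBuffer, OrnsteinUhlenbeckProcess and the BUFFER_SIZE constant referenced above are defined in the same module. PER is switched off because the td3Agent.memorize priority path is marked as not implemented.

import gym
import numpy as np

env = gym.make("Pendulum-v1")
agent = td3Agent(env, is_discrete=False, batch_size=100, w_per=False)

for episode in range(200):
    obs = env.reset()
    done, ep_reward, t = False, 0.0, 0
    while not done:
        # Actor expects a batched observation; make_action already clips the action
        act = agent.make_action(np.expand_dims(obs, axis=0), t)
        new_obs, reward, done, _ = env.step(act)
        agent.memorize(obs, act, reward, done, new_obs)
        # One critic/actor update per environment step once the buffer is warm
        if agent.buffer.size() > agent.batch_size:
            agent.train()
        obs, ep_reward, t = new_obs, ep_reward + reward, t + 1
    print(episode, round(float(ep_reward), 2))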