Example #1
    def transcate_DDPG(self):
        BATCH_SIZE = 32
        total_steps = 0  # step counter; one day counts as one step
        profit_list = []  # total profit of each episode
        profitAdvanced_list = []
        actions = 2  # number of actions
        s_dim = 87
        a_dim = 1
        brain = DDPG(
            a_dim=a_dim,
            s_dim=s_dim,
            a_bound=1.,
            LR_A=0.001,  # actor learning rate
            LR_C=0.001,  # critic learning rate
            GAMMA=.99,  # reward discount factor
            TAU=0.01,  # soft-replacement rate for the target networks
            # replacement=REPLACEMENT,
        )
        gameNum = 0  # number of episodes played
        ex_steps = 500  # number of episodes over which exploration decays
        epsilon = self.epsilon
        last_remainder = 0
        reward_list = [0]  # per-episode rewards used to compute the baseline
        Loss_list = []  # losses recorded during training
        wait_list = []  # waiting days of each of the N episodes
        gameSplit = 5000  # plot every this many episodes
        while total_steps < 60000:
            # initialize a new episode
            # routeId = random.randrange(0, 49, 1)
            routeId = 21
            self.routeline = self.allRoute[routeId]
            # print(self.routeline)
            env = Env(self.routeline)
            gameNum += 1
            # state = env.getState()  # accessed as state[0], state[1]
            today = env.getToday()

            terminal = False
            order_accepted = False
            isExploration = False
            create_date = 1
            end_date = 0
            stay_num = 0
            # one episode
            # print("GAME#:",gameNum)
            baseline = 0
            tao_prob = []
            tao_reward = []
            wait_day = []  # record which days were waited during this episode

            while today < self.routeline[-1] and not terminal:
                # a new order is generated (once 10 orders are already held, no new order is received here)
                if not order_accepted:
                    self.orderSelect(self.routeline, 60)
                    # print(self.order)
                    env.setOrder(self.order)
                    order_accepted = True
                # iterate over the self.orders dict (i.e. state[0]) and handle each order
                state = env.getState()

                # current state
                state_tf = np.mat(state)
                # print(state_tf,len(state_tf))
                # let the neural network choose the action
                if random.random() < epsilon and not isExploration:
                    isExploration = True
                    # end_date = random.randrange(env.getTodayIndex(),87,1)
                    end_date = 60

                if isExploration:
                    if env.getTodayIndex() == end_date:
                        action_model = 1
                        if ex_steps > 0:
                            ex_steps -= 1
                    else:
                        action_model = 0
                else:
                    # action from the learned policy
                    action_model = brain.choose_action(state_tf)
                    # print(action_model)

                wait_day.append(env.getTodayIndex())
                # order dict, history curve, reward

                reward = env.getReward(action_model)
                tao_reward.append(reward)
                # the order is completed or the last day is reached
                terminal = env.isTerminal(action_model)
                state_ = env.getNextState(action_model)
                if len(state_) == 1:
                    state_ = copy.deepcopy(state)
                brain.store_transition(state, action_model, reward, state_)
                # profitAdvanced_list.append(td_error[0][0])

                if brain.pointer > brain.MEMORY_CAPACITY:
                    # print(b_s_)
                    brain.learn()

                total_steps += 1
                if terminal:
                    # wait_list.append(wait_day[-1])
                    # loss = brain.learn()
                    # Loss_list.append(loss)
                    break

                # advance one day per step
                env.nextStep()
                today = env.getToday()  # refresh the current day so the loop condition can terminate

            # total profit of this episode
            epsilon = self.epsilon * (ex_steps / 500)
            print("epsilon:", epsilon)
            print("TD_Error:", baseline)
            profit = env.getTotalReward()
            profit_list.append(profit)
            print("total_steps:", total_steps)
            print("profit_list", profit_list)
            print("profit:", profit, "profitAvg:", np.mean(profit_list))
            print("action-prob:", tao_prob)
            print("Reward:", tao_reward)
            print("wait_day:", wait_day)
            self.writeHistory('./picture/history.txt', epsilon, baseline,
                              total_steps, profit_list, profit, tao_prob,
                              tao_reward, wait_day, gameNum)

            print("########################" + str(gameNum) +
                  "###########################")
            if len(profit_list) >= gameSplit:
                plt.figure()
                plt.plot(profit_list, 'r-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_profit_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), profit_list)
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_profit_PG.jpg')
                plt.figure()
                plt.plot(profitAdvanced_list, 'g-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_advanced_PG.jpg')
                plt.figure()
                plt.plot(Loss_list, 'y-')
                plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), wait_list, c='r')
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_waitDay_PG.jpg')
            if len(profit_list) >= 500:
                profit_list.clear()
                wait_list.clear()
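Example #1 is a method excerpt: it assumes numpy as np, matplotlib.pyplot as plt, random, and copy are imported at module level, and that the DDPG and Env classes are defined elsewhere. The code below is only a minimal sketch of the DDPG interface the excerpt relies on (choose_action, store_transition, learn, pointer, MEMORY_CAPACITY); the memory capacity value is assumed, and the actor/critic networks and soft target updates of the real implementation are omitted.

import numpy as np


class DDPG(object):
    """Minimal interface stub for the agent used in Example #1 (sketch only)."""

    MEMORY_CAPACITY = 10000  # assumed size; the original value is not shown

    def __init__(self, a_dim, s_dim, a_bound, LR_A, LR_C, GAMMA, TAU):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        # replay buffer rows hold [s, a, r, s_]
        self.memory = np.zeros((self.MEMORY_CAPACITY, s_dim * 2 + a_dim + 1),
                               dtype=np.float32)
        self.pointer = 0  # total number of transitions stored so far

    def choose_action(self, s):
        # placeholder: the real agent feeds s through the actor network
        return np.zeros(self.a_dim)

    def store_transition(self, s, a, r, s_):
        row = np.hstack((np.ravel(s), np.ravel(a), [r], np.ravel(s_)))
        index = self.pointer % self.MEMORY_CAPACITY  # overwrite the oldest entries
        self.memory[index, :] = row
        self.pointer += 1

    def learn(self):
        # placeholder: sample a minibatch, update the critic from the TD error,
        # update the actor with the deterministic policy gradient, then
        # soft-update the target networks
        pass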
Example #2
    def transcate_PG(self):
        total_steps = 0  # step counter; one day counts as one step
        profit_list = []  # total profit of each episode
        profitAdvanced_list = []
        actions = 2  # number of actions
        brain = PolicyGradient(
            n_actions=2,
            n_features=87,
            learning_rate=0.1,
            reward_decay=1,
        )
        gameNum = 0  # number of episodes played
        ex_steps = 500  # number of episodes over which exploration decays
        epsilon = self.epsilon
        last_remainder = 0
        reward_list = [0]  # per-episode rewards used to compute the baseline
        Loss_list = []  # losses recorded during training
        wait_list = []  # waiting days recorded per episode
        gameSplit = 500  # plot every this many episodes
        while total_steps < 60000:
            # initialize a new episode
            # routeId = random.randrange(0, 49, 1)
            routeId = 21
            self.routeline = self.allRoute[routeId]
            # print(self.routeline)
            env = Env(self.routeline)
            gameNum += 1
            # state = env.getState()  # accessed as state[0], state[1]
            today = env.getToday()

            terminal = False
            order_accepted = False
            isExploration = False
            create_date = 1
            end_date = 0
            stay_num = 0
            # one episode
            # print("GAME#:",gameNum)
            baseline = 0
            tao_prob = []
            tao_reward = 0
            wait_day = []  # record which days were waited during this episode

            while today < self.routeline[-1] and not terminal:
                # a new order is generated (once 10 orders are already held, no new order is received here)
                if not order_accepted:
                    self.orderSelect(self.routeline, 60)
                    # print(self.order)
                    env.setOrder(self.order)
                    order_accepted = True
                    # print(self.order[1])
                # iterate over the self.orders dict (i.e. state[0]) and handle each order
                state = env.getState()

                # current state
                state_tf = np.mat(state)
                # print(state_tf,len(state_tf))
                # let the neural network choose the action
                if random.random() < epsilon and not isExploration:
                    isExploration = True
                    end_date = random.randrange(env.getTodayIndex(), 87, 1)
                    # end_date = 60

                if isExploration:
                    if env.getTodayIndex() == end_date:
                        action_model = 1
                        if ex_steps > 0:
                            ex_steps -= 1
                    else:
                        action_model = 0
                else:
                    # action from the learned policy
                    action_model, p = brain.choose_action(
                        state_tf, env.getTodayIndex())
                    tao_prob.append(p)
                if action_model == 0:
                    action_finishOrder = [1, 0]
                else:
                    action_finishOrder = [0, 1]

                # order dict, history curve, reward

                reward = env.getReward(action_model)

                # the order is completed or the last day is reached
                terminal = env.isTerminal(action_model)
                if terminal:
                    tmp = reward
                    baseline = np.mean(reward_list)
                    profitAdvanced_list.append(baseline)
                    reward -= baseline
                    reward_list.append(tmp)
                    # print("END_REWARD:",reward,",reward_list:",reward_list)
                # save the transition to memory
                # print("this is store arg:",state_tf,";", action_model,";", reward,";", env.getTodayIndex())
                brain.store_transition(state_tf, action_model, reward,
                                       env.getTodayIndex())
                # print(action_model)

                total_steps += 1
                if terminal:
                    loss, wait_day, tao_reward = brain.learn()
                    Loss_list.append(loss)
                    wait_list.append(wait_day[-1])
                    break

                # advance one day per step
                env.nextStep()
                today = env.getToday()  # refresh the current day so the loop condition can terminate

            # total profit of this episode
            epsilon = self.epsilon * (ex_steps / 500)
            print("epsilon:", epsilon)
            print("Baseline:", baseline)
            profit = env.getTotalReward()
            profit_list.append(profit)
            print("total_steps:", total_steps)
            print("profit_list", profit_list)
            print("profit:", profit, "profitAvg:", np.mean(profit_list))
            print("action-prob:", tao_prob)
            print("Reward:", tao_reward)
            print("wait_day:", wait_day)
            self.writeHistory('./picture/history.txt', epsilon, baseline,
                              total_steps, profit_list, profit, tao_prob,
                              tao_reward, wait_day, gameNum)

            print("########################" + str(gameNum) +
                  "###########################")
            if len(profit_list) >= gameSplit:
                plt.figure()
                plt.plot(profit_list, 'r-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_profit_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), profit_list)
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_profit_PG.jpg')
                plt.figure()
                plt.plot(profitAdvanced_list, 'g-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_advanced_PG.jpg')
                plt.figure()
                plt.plot(Loss_list, 'y-')
                plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), wait_list, c='r')
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_waitDay_PG.jpg')
                profit_list.clear()
                wait_list.clear()
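Example #2 has the same external dependencies as Example #1. The PolicyGradient agent it drives is sketched below as a minimal interface stub: the signatures (the extra day-index argument to choose_action and store_transition, and learn() returning the loss, the waited days, and the per-step rewards) are inferred from the call sites, while the softmax policy network and the gradient update themselves are omitted.

import numpy as np


class PolicyGradient(object):
    """Minimal interface stub for the agent used in Example #2 (sketch only)."""

    def __init__(self, n_actions, n_features, learning_rate, reward_decay):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate  # step size of the policy update
        self.gamma = reward_decay  # discount factor for the returns
        # per-episode buffers: observations, actions, rewards, day indices
        self.ep_obs, self.ep_as, self.ep_rs, self.ep_days = [], [], [], []

    def choose_action(self, observation, day_index):
        # placeholder: the real agent samples from the policy network's softmax
        # output; here every action is equally likely
        probs = np.full(self.n_actions, 1.0 / self.n_actions)
        action = np.random.choice(self.n_actions, p=probs)
        return action, probs[action]

    def store_transition(self, s, a, r, day_index):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)
        self.ep_days.append(day_index)

    def learn(self):
        # placeholder: compute discounted returns and take one policy-gradient
        # step; return the loss plus the episode's day indices and rewards, then
        # clear the buffers (this matches how Example #2 unpacks the result)
        loss = 0.0
        wait_day = list(self.ep_days)
        tao_reward = list(self.ep_rs)
        self.ep_obs, self.ep_as, self.ep_rs, self.ep_days = [], [], [], []
        return loss, wait_day, tao_reward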