Example #1
def main():

    p1 = Agent()
    p2 = Agent()

    e = Env()

    p1.setSymbol(e.x)
    p2.setSymbol(e.o)

    p1.setV(e.initValues(p1.symbol))
    p2.setV(e.initValues(p2.symbol))

    for i in range(10000):
        if i % 1000 == 0:
            print "epoch: {}".format(i)
        play_game(p1, p2, Env())
    print "Training Complete"

    human = Human()
    human.set_symbol(e.o)
    p1.verbose = True

    while True:
        play_game(p1, human, Env(), draw=True)

        answer = input("Play again? [y,n]: ")
        if answer and answer.lower()[0] == 'n':
            break
Example #2
import os
from configparser import ConfigParser


def env():
    config = ConfigParser()
    config.read(["data.ini"])
    # 1. Read via an environment variable: set "env" to one of the environment
    #    names defined in data.ini, e.g. put "env": "test_env" into the environment variables
    api_root_url = config[os.environ['env']]['api_root_url']
    # 2. Read the section directly:
    # api_root_url = config['test_env']['api_root_url']
    yield Env(api_root_url=api_root_url,
              username=os.environ["username"],
              password=os.environ["password"])
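
A minimal, self-contained sketch of what this fixture expects: a data.ini with one section per environment, read either through the "env" environment variable or directly by section name. The section name and URL below are assumptions for illustration only.

import os
from configparser import ConfigParser

# write a throwaway data.ini so the sketch runs end to end (contents are assumed)
with open("data.ini", "w") as f:
    f.write("[test_env]\napi_root_url = http://localhost:8000\n")

os.environ["env"] = "test_env"  # path 1: choose the section via the "env" variable

config = ConfigParser()
config.read(["data.ini"])
print(config[os.environ["env"]]["api_root_url"])  # path 1: via the environment variable
print(config["test_env"]["api_root_url"])         # path 2: read the section directly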
Example #3
def testEnv():
    env = Env()
    channelThroughPut = 0  # fraction of time that packets are successfully delivered over the channel,
    # i.e. no collisions or idle time slots
    for iteration in range(config.Iterations):
        for t in range(config.TimeSlots):
            initialState = env.reset()
            for user in range(config.N):
                action = slottedAlohaProtocol()
                env.step(action=action, user=user)
                # each user changes the inner state of the environment; the environment uses that inner state
                # to keep track of the channels and the ACK signals for each user
            nextStateForEachUser, rewardForEachUser = env.getNextState()
            # a reward of one means that a packet was successfully delivered over the channel;
            # the sum has a maximum of the number of channels -> config.K
            channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
    # average over all iterations and time slots (expected value)
    channelThroughPut = channelThroughPut / (config.Iterations *
                                             config.TimeSlots)
    print("Channel Utilization average {}".format(channelThroughPut))
    ToPlotX = range(config.Iterations * config.TimeSlots)
    ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
    plot_graph(data=[ToPlotX, ToPlotY],
               filename="Aloha",
               title="Aloha",
               xlabel="Time slot",
               ylabel="Average channel utilization",
               legend="SlottedAloha")


#
#
# def testTimeEnv():
#     env = TimeDependentEnv()
#     channelThroughPut = 0  # fraction of time that packets are successfully delivered over the channel,
#     # i.e. no collisions or idle time slots
#     for iteration in range(config.Iterations):
#         TimeSPU = env.reset()
#         for t in range(config.TimeSlots):
#             env.resetTimeStep()
#             #  reset the internal state of the environment
#             #  which keep tracks of the users actions through out the time step
#             for user in range(config.N):
#                 action = slottedAlohaProtocol()
#                 env.step(action=action, user=user)
#                 # each user changes the inner state of the environment; the environment uses that inner state
#                 # to keep track of the channels and the ACK signals for each user
#             nextStateForEachUser, rewardForEachUser = env.tstep(timestep=t)
#             # a reward of one means that a packet was successfully delivered over the channel
#             # the sum has a maximum of the number of channels -> config.K
#             channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
#     # measuring the expected value
#     channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
#     print("Channel Utilization average {}".format(channelThroughPut))
#     ToPlotX = range(config.Iterations * config.TimeSlots)
#     ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
#     plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
#                xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")
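
For context, a minimal self-contained sketch of the slottedAlohaProtocol() call used above: in slotted ALOHA each user independently transmits in a slot with some fixed probability. The transmit probability and the 0/1 action encoding are assumptions; the real implementation is not shown here.

import numpy as np

def slottedAlohaProtocol(transmit_probability=0.5):
    # assumed interface: return 1 to transmit in this time slot, 0 to stay silent
    return 1 if np.random.rand() < transmit_probability else 0

# with N users sharing one channel, the expected per-slot throughput is N*p*(1-p)**(N-1)
N, p = 4, 0.25
print(sum(slottedAlohaProtocol(p) for _ in range(N)))  # transmissions attempted in one sample slot
print(N * p * (1 - p) ** (N - 1))                      # ~0.42, the theoretical success probability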
Example #4
def test_agents():
    result = np.zeros([5, 5])
    maps, trials_per_map = 10, 10
    ave_cost_1 = []
    ave_cost_2 = []
    ave_cost_3 = []
    for j in range(maps):
        en = Env(50)
        cost_1 = []
        cost_2 = []
        cost_3 = []
        # cost_4 = []
        for k in range(trials_per_map):
            print(f'map: {j + 1}/{maps}, play:  {k + 1}/{trials_per_map}')
            # en.set_target_on_type(i)
            en.set_target()
            en.print_target()

            agent_1 = Agent(en)
            searches_1, distance_1 = agent_1.run(1, False)
            sum_1 = searches_1 + distance_1
            cost_1.append(sum_1)

            agent_2 = Agent(en)
            searches_2, distance_2 = agent_2.run(2, False)
            sum_2 = searches_2 + distance_2
            cost_2.append(sum_2)

            agent_3 = Agent(en)
            searches_3, distance_3 = agent_3.run_improved(10000)
            sum_3 = searches_3 + distance_3
            cost_3.append(sum_3)

        ave_cost_1.append(sum(cost_1) / len(cost_1))
        ave_cost_2.append(sum(cost_2) / len(cost_2))
        ave_cost_3.append(sum(cost_3) / len(cost_3))
    result[0][1] = sum(ave_cost_1) / len(ave_cost_1)
    result[0][2] = sum(ave_cost_2) / len(ave_cost_2)
    result[0][3] = sum(ave_cost_3) / len(ave_cost_3)
    print(result)
Example #5
        self.model.load_weights('model.h5')
        print("The Model loaded")

    def update_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)

    def learning_rate_decay(self):
        lr = self.optimizer.lr.numpy()
        lr = max(self.lr_decay * lr, 0.001)
        self.optimizer.lr.assign(lr)


if __name__ == "__main__":

    # create environment
    env = Env()
    agent = DQNAgent()
    total_scores = np.empty(EPISODES)
    iteration = 0

    for e in range(EPISODES):

        state = env.reset()
        check_list = env.check_if_reward(state)
        goal = check_list['if_goal']  # done
        wumpus = check_list['if_wumpus']  # done

        losses = []
        score = 0  # done

        while (not goal) and (not wumpus):
Example #6
'''
Created on Sep 6, 2018

@author: dabrown
'''

from Cutie_Network import Cutie
from Environment import Env

if __name__ == '__main__':
    # first, init the Env
    env = Env()
    cutie = Cutie(env)
    cutie.train_nework(env)
Example #7
import pandas as pd
from tqdm import tqdm

from collections import namedtuple
StateVars = namedtuple('state_vars',
                       ['curr_state', 'prev_state_hash', 'reward'])
from Environment import State, Env

need = pd.read_csv('../fake_4region_trip_20170510.csv')
# dist=pd.read_csv('fake_4region_distance.csv')
# dist=dist.values
eps_num = 4
car_num = 1

env = Env(initial_region_state=[15, 15, 15, 15],
          capacity_each_step=10,
          max_episode=eps_num,
          car_count=car_num,
          need=need)
history = {i: dict() for i in range(8)}

for region in range(env.region_count):
    state = env.new_state()
    curr_state_hash = state.get_hash()
    state.out_stage()
    for car in range(env.car_num):
        for move in range(-env.capacity_each_step, env.capacity_each_step + 1):
            if state.check_feasible(region, car, move):

                new_state = state.step(region, car, move)
                new_state.in_stage()
                new_state_hash = new_state.get_hash()
Example #8
            resultLeaf = resultLeaf.left

        if not temp.isLeafNode():
            addNextStep(temp, resultLeaf)

    addNextStep(root, resultLeaf)

    totalReward = numpy.sum(rewards)

    print('The optimal way to traverse the tree with a total reward of ' +
          str(totalReward) + ' would be:')
    print(result)


if __name__ == '__main__':
    environment = Env()

    alpha = 0.1
    gamma = 0.6
    epsilon = 0.3

    number_of_episodes = 10000

    q_table = numpy.zeros([environment.normalizedtree.getNumberOfNodes(), 2])

    for i in range(1, number_of_episodes):
        state = environment.reset()
        visited_states = []

        penalties, reward = 0, 0
        done = False
Example #9
def empezarPrueba():
    env = Env()
    env.width = 10
    env.height = 6
    env.posY = 6

    # QTable: contains the Q-value for each (state, action) pair
    qtable = np.random.rand(env.stateCount, env.actionCount).tolist()

    epochs = 100  # number of iterations the algorithm will run
    gamma = 0.8  # factor that discounts rewards exponentially as actions go by
    epsilon = 0.1
    decay = 0.1

    print("Mapa inicial")
    # Generamos el mapa del problema en concreto
    env.crearMapaPrueba()

    for i in range(epochs):
        # Restart the algorithm at the start of each iteration
        state, reward, done = env.reset()
        steps = 0

        # Keep taking actions and changing state until the algorithm
        # reaches the proposed final state
        while not done:
            print("epoch #", i + 1, "/", epochs)
            time.sleep(0.05)
            # Draw the new position of the A on the map
            env.modificaMapa(i + 1)
            # Count the steps taken until reaching the end
            steps += 1

            # When epsilon is greater than the randomly generated number,
            # a random action is taken
            if np.random.uniform() < epsilon:
                action = env.randomAction()
            # Otherwise, pick the action with the highest value found in the table
            else:
                action = qtable[state].index(max(qtable[state]))

            # Compute the next state, the reward obtained, and whether the iteration has finished
            next_state, reward, done = env.step(action)

            # Make the reward smaller on each step
            reward = reward - (steps * 0.3)

            # Update the Q-table with the values from the Bellman equation
            pos = reward * (gamma**steps / 2) + 0.9 * max(qtable[next_state])
            qtable[state][action] = pos

            # If the episode ends, draw the A at the final position
            if done:
                env.fin(i + 1)

            # Show the final map with Tkinter
            if done and i + 1 == epochs:
                time.sleep(30)
                tk.mainloop()

            # Update the state
            state = next_state
        # Epsilon is reduced each iteration so the algorithm makes fewer random choices
        epsilon -= decay * epsilon

        print("\nDone in", steps, "steps".format(steps))
        time.sleep(0.8)
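
The Q-table update in the loop above differs from the textbook rule. For comparison, here is a minimal self-contained sketch of the standard tabular Q-learning update; the table shape, learning rate, and sample transition are made up for illustration.

import numpy as np

alpha, gamma = 0.5, 0.8    # learning rate and discount factor (assumed values)
qtable = np.zeros((2, 2))  # toy table: 2 states x 2 actions

state, action, reward, next_state = 0, 1, 1.0, 1
# standard Q-learning: move Q(s, a) toward reward + gamma * max_a' Q(s', a')
qtable[state][action] += alpha * (reward + gamma * np.max(qtable[next_state]) - qtable[state][action])
print(qtable)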
Example #10
import gym
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, merge
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from Environment import Env, status, actions

import matplotlib.pyplot as plt

gym.undo_logger_setup()

ENV_NAME = 'SQ'
env = Env()
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

actor_depth = 4
actor_width = 32

critic_depth = 6
critic_width = 64

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
for k in range(actor_depth):
Example #11
            # and do the model fit
            pass

    # save the model which is under training
    def save_model(self):
        self.model.save_weights('model.h5')

    # load the saved model
    def load_model(self):
        self.model.load_weights('model.h5')


if __name__ == "__main__":

    # create environment
    env = Env()
    agent = DQNAgent()

    # code

    for e in range(EPISODES):
        state = env.reset()

        # code

        while (not goal) and (not wumpus):
            if agent.render:
                env.render()

            # code
Example #12
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

from Environment import Env

ENV_NAME = 'Pendulum-v0'

gym.undo_logger_setup()


# Get the environment and extract the number of actions.
# env = gym.make(ENV_NAME)
env = Env()

np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
Example #13
    utility_next = utility_matrix[next_state]
    delta = reward + gamma * utility_next - utility
    utility_matrix[state] += alpha * (delta)

    return utility_matrix, delta


def updateActor(state_action_matrix, state, action, delta):
    beta = 1
    state_action_matrix[state, action] += beta * delta

    return state_action_matrix


if __name__ == '__main__':
    environment = Env()

    alpha = 0.1
    gamma = 0.99
    epsilon = 0.01

    number_of_episodes = 10000

    state_action_pairs = numpy.full(
        (environment.normalizedtree.getNumberOfNodes(), 2), 0.5)
    utility_matrix = numpy.zeros(
        [environment.normalizedtree.getNumberOfNodes()])

    softmax = lambda vals: numpy.exp(vals - numpy.max(vals)) / numpy.sum(
        numpy.exp(vals - numpy.max(vals)))
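
A minimal self-contained sketch of how the softmax above is typically used to turn one row of the actor's state_action_pairs table into a sampled action; the table size and state index are made up for illustration.

import numpy

softmax = lambda vals: numpy.exp(vals - numpy.max(vals)) / numpy.sum(
    numpy.exp(vals - numpy.max(vals)))

state_action_pairs = numpy.full((5, 2), 0.5)        # toy table: 5 states, 2 actions
state = 3
probabilities = softmax(state_action_pairs[state, :])
action = numpy.random.choice(2, p=probabilities)    # sample an action from the actor's preferences
print(action, probabilities)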
Example #14
__author__ = 'Maira'
from Environment import Env
from Learning import Example, Effect
from OIlogic import AtomSet, Atom, Term

env = Env(4)
model = env.get_model()
state = env.generateState()
actions = env.getAllActions()
a, b, c, d, floor = Term('a'), Term('b'), Term('c'), Term('d'), Term('floor')
ex_1 = Example(state, actions[1], Effect(AtomSet([Atom("ON\\2", [a, b])]), AtomSet([Atom("ON\\2", [a, floor])])))
ex_2 = Example(state, actions[11], Effect(AtomSet([Atom("ON\\2", [c, d])]), AtomSet([Atom("ON\\2", [c, floor])])))

print(ex_1)
print(ex_2)

model.memorizeEx(ex_1)
model.memorizeEx(ex_2)

print(model)
rules = model.get_rules()
for r in rules:
    s = model.specialize(r)
    print(s)
    uex = model.getUncovEx(r)
    print('Examples:')
    for e in uex:
        print('\t'+str(e))
    c = model.contradicted(r)
    print(c)
exs = model.get_exMem()
Example #15
import random
from Environment import MetaEnvironment as Env

f = open("query.txt")

queryList = []
for line in f.readlines():
    line = line.strip()
    queryList.append(line)

env = Env(5)

for i in range(5):
    traceList = queryList[i * 10:(i + 1) * 10]
    state = env.state(traceList)
    print(state)
    moveList = [
        random.randint(0, env.server_num - 1) for _ in range(len(env.nodes))
    ]
    env.take_actions(moveList)
    print('Loc:', env.locality())
    print('Load:', env.load())
Example #16
import numpy as np
from Agent import Agent
from utils import plotLearning
from Environment import Env

if __name__ == '__main__':
    env = Env()
    num_games = 250
    load_checkpoint = False

    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  lr=5e-4,
                  input_dims=[8],
                  n_actions=4,
                  mem_size=100000,
                  eps_min=0.01,
                  batch_size=64,
                  eps_dec=1e-3,
                  replace=100)

    if load_checkpoint:
        agent.load_models()

    filename = 'DDQN.png'
    scores = []
    eps_history = []
    n_steps = 0

    for i in range(num_games):
        done = False
Example #17
    def transcate_PG(self):
        total_steps = 0  # step counter; one day is one step
        profit_list = []  # total profit of each game
        profitAdvanced_list = []
        actions = 2  # number of actions
        brain = PolicyGradient(
            n_actions=2,
            n_features=87,
            learning_rate=0.1,
            reward_decay=1,
        )
        gameNum = 0  # number of games played
        ex_steps = 500  # number of rounds over which exploration decays
        epsilon = self.epsilon
        last_remainder = 0
        reward_list = [0]  # stores the reward of each round, used to compute the baseline
        Loss_list = []  # stores the loss values during training
        wait_list = []  # records the waiting days
        gameSplit = 500  # plot the figures every this many games
        while total_steps < 60000:
            # initialize the game
            # routeId = random.randrange(0, 49, 1)
            routeId = 21
            self.routeline = self.allRoute[routeId]
            # print(self.routeline)
            env = Env(self.routeline)
            gameNum += 1
            # state = env.getState()  # accessed as state[0], state[1]
            today = env.getToday()

            terminal = False
            order_accepted = False
            isExploration = False
            create_date = 1
            end_date = 0
            stay_num = 0
            # one game
            # print("GAME#:",gameNum)
            baseline = 0
            tao_prob = []
            tao_reward = 0
            wait_day = []

            while today < self.routeline[-1] and terminal == False:
                # a new order arrives (no new order is received here once there are already 10 orders)
                if order_accepted == False:
                    self.orderSelect(self.routeline, 60)
                    # print(self.order)
                    env.setOrder(self.order)
                    order_accepted = True
                    # print(self.order[1])
                # iterate over the self.orders dict (i.e. state[0]) and handle each order
                state = env.getState()

                # current state
                state_tf = np.mat(state)
                # print(state_tf,len(state_tf))
                # let the neural network choose the action
                if random.random() < epsilon and isExploration == False:
                    isExploration = True
                    end_date = random.randrange(env.getTodayIndex(), 87, 1)
                    # end_date = 60

                if isExploration:
                    if env.getTodayIndex() == end_date:
                        action_model = 1
                        if ex_steps > 0:
                            ex_steps -= 1
                    else:
                        action_model = 0
                else:
                    #action from learning
                    action_model, p = brain.choose_action(
                        state_tf, env.getTodayIndex())
                    tao_prob.append(p)
                if action_model == 0:
                    action_finishOrder = [1, 0]
                else:
                    action_finishOrder = [0, 1]

                # order dict, history curve, reward

                reward = env.getReward(action_model)

                # the order is completed or the last day is reached
                terminal = env.isTerminal(action_model)
                if terminal:
                    tmp = reward
                    baseline = np.mean(reward_list)
                    profitAdvanced_list.append(baseline)
                    reward -= baseline
                    reward_list.append(tmp)
                    # print("END_REWARD:",reward,",reward_list:",reward_list)
                # store the transition in the memory buffer
                # print("this is store arg:",state_tf,";", action_model,";", reward,";", env.getTodayIndex())
                brain.store_transition(state_tf, action_model, reward,
                                       env.getTodayIndex())
                # print(action_model)

                total_steps += 1
                if terminal:
                    loss, wait_day, tao_reward = brain.learn()
                    Loss_list.append(loss)
                    wait_list.append(wait_day[-1])
                    break

                # step: advance by one day
                env.nextStep()

            # total profit of this game
            epsilon = self.epsilon * (ex_steps / 500)
            print("epsilon:", epsilon)
            print("Baseline:", baseline)
            profit = env.getTotalReward()
            profit_list.append(profit)
            print("total_steps:", total_steps)
            print("profit_list", profit_list)
            print("profit:", profit, "profitAvg:", np.mean(profit_list))
            print("action-prob:", tao_prob)
            print("Reward:", tao_reward)
            print("wait_day:", wait_day)
            self.writeHistory('./picture/history.txt', epsilon, baseline,
                              total_steps, profit_list, profit, tao_prob,
                              tao_reward, wait_day, gameNum)

            print("########################" + str(gameNum) +
                  "###########################")
            if len(profit_list) >= gameSplit:
                plt.figure()
                plt.plot(profit_list, 'r-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_profit_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), profit_list)
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_profit_PG.jpg')
                plt.figure()
                plt.plot(profitAdvanced_list, 'g-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_advanced_PG.jpg')
                plt.figure()
                plt.plot(Loss_list, 'y-')
                plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), wait_list, c='r')
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_waitDay_PG.jpg')
                profit_list.clear()
                wait_list.clear()
Example #18
        self.nb_episodes_random = 100
        self.nb_episodes = 100
        self.batch_size = 64
        self.mission_file = './maze.xml'
        self.memory_capacity = 100000
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.epsilon = 0.2
        self.huber_loss_delta = 2.0
        self.update_target_frequency = 25
        self.max_epsilon = 0.7
        self.min_epsilon = 0.1
        self.decreasing_rate = -math.log(0.01) / self.nb_episodes


hps = HPS()
plt.plot(hps.min_epsilon + (hps.max_epsilon - hps.min_epsilon) *
         np.exp(-hps.decreasing_rate * np.arange(hps.nb_episodes)))

env = Env(hps.mission_file)
randomAgent = RandomAgent(hps)
play(env, hps, randomAgent, hps.nb_episodes_random, train=True)

Agent = DDQNPER_Agent(hps)
Agent.memory = randomAgent.memory  # hand the replay memory filled by random play to the learning agent
##Agent.load()
##Agent.save()
#Agent.epsilon = 0.15
play(env, hps, Agent, hps.nb_episodes, train=True, save_victory=False)
#play(env, hps, Agent, 40, train=False, save_victory=True)
#plt.plot(Agent.losses)
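
The epsilon schedule plotted at the top of this example decays exponentially from max_epsilon toward min_epsilon. A minimal self-contained check of that formula, using the same HPS values:

import math
import numpy as np

nb_episodes = 100
min_epsilon, max_epsilon = 0.1, 0.7
decreasing_rate = -math.log(0.01) / nb_episodes

epsilons = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decreasing_rate * np.arange(nb_episodes))
print(epsilons[0], epsilons[-1])  # starts at 0.7 and decays toward roughly 0.1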
Example #19
    def transcate_DDPG(self):
        BATCH_SIZE = 32
        total_steps = 0  # step counter; one day is one step
        profit_list = []  # total profit of each game
        profitAdvanced_list = []
        actions = 2  # number of actions
        s_dim = 87
        a_dim = 1
        brain = DDPG(
            a_dim=a_dim,
            s_dim=s_dim,
            a_bound=1.,
            LR_A=0.001,
            LR_C=0.001,
            GAMMA=.99,
            TAU=0.01,
            # replacement=REPLACEMENT,
        )
        gameNum = 0  # number of games played
        ex_steps = 500  # number of rounds over which exploration decays
        epsilon = self.epsilon
        last_remainder = 0
        reward_list = [0]  # stores the reward of each round, used to compute the baseline
        Loss_list = []  # stores the loss values during training
        wait_list = []  # records the waiting days of each of the N games
        gameSplit = 5000  # plot the figures every this many games
        while total_steps < 60000:
            # initialize the game
            # routeId = random.randrange(0, 49, 1)
            routeId = 21
            self.routeline = self.allRoute[routeId]
            # print(self.routeline)
            env = Env(self.routeline)
            gameNum += 1
            # state = env.getState()  # accessed as state[0], state[1]
            today = env.getToday()

            terminal = False
            order_accepted = False
            isExploration = False
            create_date = 1
            end_date = 0
            stay_num = 0
            # one game
            # print("GAME#:",gameNum)
            baseline = 0
            tao_prob = []
            tao_reward = []
            wait_day = []  # records which days this game waited

            while today < self.routeline[-1] and terminal == False:
                # a new order arrives (no new order is received here once there are already 10 orders)
                if order_accepted == False:
                    self.orderSelect(self.routeline, 60)
                    # print(self.order)
                    env.setOrder(self.order)
                    order_accepted = True
                # iterate over the self.orders dict (i.e. state[0]) and handle each order
                state = env.getState()

                # current state
                state_tf = np.mat(state)
                # print(state_tf,len(state_tf))
                # let the neural network choose the action
                if random.random() < epsilon and isExploration == False:
                    isExploration = True
                    # end_date = random.randrange(env.getTodayIndex(),87,1)
                    end_date = 60

                if isExploration:
                    if env.getTodayIndex() == end_date:
                        action_model = 1
                        if ex_steps > 0:
                            ex_steps -= 1
                    else:
                        action_model = 0
                else:
                    #action from learning
                    action_model = brain.choose_action(state_tf)
                    # print(action_model)

                wait_day.append(env.getTodayIndex())
                # order dict, history curve, reward

                reward = env.getReward(action_model)
                tao_reward.append(reward)
                # the order is completed or the last day is reached
                terminal = env.isTerminal(action_model)
                state_ = env.getNextState(action_model)
                if len(state_) == 1:
                    state_ = copy.deepcopy(state)
                brain.store_transition(state, action_model, reward, state_)
                # profitAdvanced_list.append(td_error[0][0])

                if brain.pointer > brain.MEMORY_CAPACITY:
                    # print(b_s_)
                    brain.learn()

                total_steps += 1
                if terminal:
                    # wait_list.append(wait_day[-1])
                    # loss = brain.learn()
                    # Loss_list.append(loss)
                    break

                # step: advance by one day
                env.nextStep()

            # total profit of this game
            epsilon = self.epsilon * (ex_steps / 500)
            print("epsilon:", epsilon)
            print("TD_Error:", baseline)
            profit = env.getTotalReward()
            profit_list.append(profit)
            print("total_steps:", total_steps)
            print("profit_list", profit_list)
            print("profit:", profit, "profitAvg:", np.mean(profit_list))
            print("action-prob:", tao_prob)
            print("Reward:", tao_reward)
            print("wait_day:", wait_day)
            self.writeHistory('./picture/history.txt', epsilon, baseline,
                              total_steps, profit_list, profit, tao_prob,
                              tao_reward, wait_day, gameNum)

            print("########################" + str(gameNum) +
                  "###########################")
            if len(profit_list) >= gameSplit:
                plt.figure()
                plt.plot(profit_list, 'r-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_profit_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), profit_list)
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_profit_PG.jpg')
                plt.figure()
                plt.plot(profitAdvanced_list, 'g-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_advanced_PG.jpg')
                plt.figure()
                plt.plot(Loss_list, 'y-')
                plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), wait_list, c='r')
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_waitDay_PG.jpg')
            if len(profit_list) >= 500:
                profit_list.clear()
                wait_list.clear()
Example #20
import copy
import pylab
import numpy as np
import tensorflow as tf
from Environment import Env
from Agent import PG
from Agent import TUC
import pickle

np.random.seed(0)
EPISODES = 50

env = Env()
agent = PG()

EP_reward_sums, episodes = [], []
#agent.save_model("./model_init/PG1")
agent.load_model("./model_init/PG1")

# Session settings
GPU_mem_ratio = 0.2
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_mem_ratio)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Create recomposed transition critic
state_dim = 22
hidden_dim = 3
critic_hidden_dim = 2
action_dim = 5
tuc = TUC(sess, "TUC", state_dim, action_dim, 0.003)
#tuc.save_model("./model_init/TUC1")
Example #21
from Environment import Env
import numpy as np
import pandas as pd

initial_region_state = [15, 15, 15, 15]
capacity_each_step = 10
max_episode = 5
car_count = 1
need = pd.read_csv('../fake_4region_trip_20170510.csv')

env = Env(initial_region_state, capacity_each_step, max_episode, car_count, need)
NUM_ACTIONS = (2 * env.capacity_each_step + 1) * env.region_count  # [-500, 500] * 4 regions
NUM_STATES = 2 * env.region_count + 7  # MountainCar-v0: (2,)

history_dict = {i: dict() for i in range(8)}
history_action = {i: dict() for i in range(8)}

state = env.init()
print(state)

for action in range(NUM_ACTIONS):
    env.reset()
    env.pre_step()
    move = action % (2 * env.capacity_each_step + 1) - env.capacity_each_step
    region = int(np.floor(action / (2 * env.capacity_each_step + 1)))
    if env.check_feasible(env.state, region, 0, move):
        state, reward, recent_R = env.step(region, 0, move)
        if (state in history_dict[0] and history_dict[0][state] < reward) \
                or state not in history_dict[0]:
            history_dict[0][state] = (reward, recent_R)  # record state -> (reward, recent_R)
            history_action[0][state] = (move, region, reward)  # record state -> (move, region, reward)
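
A minimal self-contained round-trip check of the action encoding used above: each flat action index maps to a (region, move) pair with move in [-capacity_each_step, capacity_each_step]. The constants mirror this example; the re-encoding line is added only to confirm the mapping.

import numpy as np

capacity_each_step = 10
region_count = 4
NUM_ACTIONS = (2 * capacity_each_step + 1) * region_count

for action in range(NUM_ACTIONS):
    move = action % (2 * capacity_each_step + 1) - capacity_each_step
    region = int(np.floor(action / (2 * capacity_each_step + 1)))
    # recover the flat index from (region, move) to confirm the mapping is one-to-one
    assert action == region * (2 * capacity_each_step + 1) + (move + capacity_each_step)
print("all", NUM_ACTIONS, "action indices decode and re-encode consistently")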