Example #1
def main():

    p1 = Agent()
    p2 = Agent()

    e = Env()

    p1.setSymbol(e.x)
    p2.setSymbol(e.o)

    p1.setV(e.initValues(p1.symbol))
    p2.setV(e.initValues(p2.symbol))

    for i in range(10000):
        if i % 1000 == 0:
            print "epoch: {}".format(i)
        play_game(p1, p2, Env())
    print "Training Complete"

    human = Human()
    human.set_symbol(e.o)
    p1.verbose = True

    while True:
        play_game(p1, human, Env(), draw=True)

        answer = raw_input("Pay again? [y,n] :")
        if answer and answer.lower()[0] == 'n':
            break
Example #2
def testEnv():
    env = Env()
    channelThroughPut = 0  # fraction of time that packets are successfully delivered over the channel
    # i.e. no collisions or idle time slots
    for iteration in range(config.Iterations):
        for t in range(config.TimeSlots):
            initialState = env.reset()
            for user in range(config.N):
                action = slottedAlohaProtocol()
                env.step(action=action, user=user)
                # each user changes the inner state of the environment where the environment uses the inner state
                # in order to keep track of the channels and the ACK signals for each user
            nextStateForEachUser, rewardForEachUser = env.getNextState()
            # if a reward is one, that means that a packet was successfully delivered over the channel
            # the sum has a maximum of the number of channels -> config.K
            channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
    # measuring the expected value
    channelThroughPut = channelThroughPut / (config.Iterations *
                                             config.TimeSlots)
    print("Channel Utilization average {}".format(channelThroughPut))
    ToPlotX = range(config.Iterations * config.TimeSlots)
    ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
    plot_graph(data=[ToPlotX, ToPlotY],
               filename="Aloha",
               title="Aloha",
               xlabel="Time slot",
               ylabel="Average channel utilization",
               legend="SlottedAloha")


#
#
# def testTimeEnv():
#     env = TimeDependentEnv()
#     channelThroughPut = 0  # fraction of time that packets are successfully delivered over the channel
#     # i.e. no collisions or idle time slots
#     for iteration in range(config.Iterations):
#         TimeSPU = env.reset()
#         for t in range(config.TimeSlots):
#             env.resetTimeStep()
#             #  reset the internal state of the environment
#             #  which keeps track of the users' actions throughout the time step
#             for user in range(config.N):
#                 action = slottedAlohaProtocol()
#                 env.step(action=action, user=user)
#                 # each user changes the inner state of the environment where the environment uses the inner state
#                 # in order to keep track of the channels and the ACK signals for each user
#             nextStateForEachUser, rewardForEachUser = env.tstep(timestep=t)
#             # if a reward is one, that means that a packet was successfully delivered over the channel
#             # the sum has a maximum of the number of channels -> config.K
#             channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
#     # measuring the expected value
#     channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
#     print("Channel Utilization average {}".format(channelThroughPut))
#     ToPlotX = range(config.Iterations * config.TimeSlots)
#     ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
#     plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
#                xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")
Example #3
def env():
    config = ConfigParser()
    config.read(["data.ini"])
    # 1. Read the section via an environment variable: set the "env" environment
    #    variable to a section name defined in data.ini, e.g. env=test_env
    api_root_url = config[os.environ['env']]['api_root_url']
    # 2. Or read a section directly:
    # api_root_url = config['test_env']['api_root_url']
    yield Env(api_root_url=api_root_url,
              username=os.environ["username"],
              password=os.environ["password"])
Example #4
def test_agents():
    result = np.zeros([5, 5])
    maps, trials_per_map = 10, 10
    ave_cost_1 = []
    ave_cost_2 = []
    ave_cost_3 = []
    for j in range(maps):
        en = Env(50)
        cost_1 = []
        cost_2 = []
        cost_3 = []
        # cost_4 = []
        for k in range(trials_per_map):
            print(f'map: {j + 1}/{maps}, play:  {k + 1}/{trials_per_map}')
            # en.set_target_on_type(i)
            en.set_target()
            en.print_target()

            agent_1 = Agent(en)
            searches_1, distance_1 = agent_1.run(1, False)
            sum_1 = searches_1 + distance_1
            cost_1.append(sum_1)

            agent_2 = Agent(en)
            searches_2, distance_2 = agent_2.run(2, False)
            sum_2 = searches_2 + distance_2
            cost_2.append(sum_2)

            agent_3 = Agent(en)
            searches_3, distance_3 = agent_3.run_improved(10000)
            sum_3 = searches_3 + distance_3
            cost_3.append(sum_3)

        ave_cost_1.append(sum(cost_1) / len(cost_1))
        ave_cost_2.append(sum(cost_2) / len(cost_2))
        ave_cost_3.append(sum(cost_3) / len(cost_3))
    result[0][1] = sum(ave_cost_1) / len(ave_cost_1)
    result[0][2] = sum(ave_cost_2) / len(ave_cost_2)
    result[0][3] = sum(ave_cost_3) / len(ave_cost_3)
    print(result)
Example #5
import copy
import pylab
import numpy as np
import tensorflow as tf
from Environment import Env
from Agent import PG
from Agent import TUC
import pickle

np.random.seed(0)
EPISODES = 50

env = Env()
agent = PG()

EP_reward_sums, episodes = [], []
#agent.save_model("./model_init/PG1")
agent.load_model("./model_init/PG1")

# Session settings
GPU_mem_ratio = 0.2
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_mem_ratio)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Create recomposed transition critic
state_dim = 22
hidden_dim = 3
critic_hidden_dim = 2
action_dim = 5
tuc = TUC(sess, "TUC", state_dim, action_dim, 0.003)
#tuc.save_model("./model_init/TUC1")
Example #6
        self.nb_episodes_random = 100
        self.nb_episodes = 100
        self.batch_size = 64
        self.mission_file = './maze.xml'
        self.memory_capacity = 100000
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.epsilon = 0.2
        self.huber_loss_delta = 2.0
        self.update_target_frequency = 25
        self.max_epsilon = 0.7
        self.min_epsilon = 0.1
        self.decreasing_rate = -math.log(0.01) / self.nb_episodes


hps = HPS()
plt.plot(hps.min_epsilon + (hps.max_epsilon - hps.min_epsilon) *
         np.exp(-hps.decreasing_rate * np.arange(hps.nb_episodes)))

env = Env(hps.mission_file)
randomAgent = RandomAgent(hps)
play(env, hps, randomAgent, hps.nb_episodes_random, train=True)

Agent = DDQNPER_Agent(hps)
Agent.memory = randomAgent.memory  # hand the random agent's collected experience to the DDQN agent
##Agent.load()
##Agent.save()
#Agent.epsilon = 0.15
play(env, hps, Agent, hps.nb_episodes, train=True, save_victory=False)
#play(env, hps, Agent, 40, train=False, save_victory=True)
#plt.plot(Agent.losses)
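The plt.plot call above visualises the exploration schedule implied by HPS: epsilon decays exponentially from max_epsilon toward min_epsilon, and because decreasing_rate = -log(0.01) / nb_episodes it ends up at roughly min_epsilon + 0.01 * (max_epsilon - min_epsilon) on the final episode. A small self-contained check of that schedule, using the HPS defaults shown above:

import math
import numpy as np

nb_episodes, min_eps, max_eps = 100, 0.1, 0.7          # HPS defaults from the snippet
rate = -math.log(0.01) / nb_episodes                   # decreasing_rate
epsilons = min_eps + (max_eps - min_eps) * np.exp(-rate * np.arange(nb_episodes))
print(epsilons[0], epsilons[-1])                       # ~0.70 at episode 0, ~0.11 at the last episode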
Example #7
def empezarPrueba():
    env = Env()
    env.width = 10
    env.height = 6
    env.posY = 6

    # Q-table: holds the Q value for each (state, action) pair
    qtable = np.random.rand(env.stateCount, env.actionCount).tolist()

    epochs = 100  # number of iterations the algorithm will run
    gamma = 0.8  # discount factor: rewards further in the future count exponentially less
    epsilon = 0.1
    decay = 0.1

    print("Mapa inicial")
    # Generamos el mapa del problema en concreto
    env.crearMapaPrueba()

    for i in range(epochs):
        # Reset the environment at the start of each iteration
        state, reward, done = env.reset()
        steps = 0

        # Keep taking actions and changing state until the algorithm
        # reaches the proposed final state
        while not done:
            print("epoch #", i + 1, "/", epochs)
            time.sleep(0.05)
            # Draw the new position of the "A" on the map
            env.modificaMapa(i + 1)
            # Count the steps taken until the end is reached
            steps += 1

            # When epsilon is greater than the randomly generated number,
            # take a random action
            if np.random.uniform() < epsilon:
                action = env.randomAction()
            # Otherwise pick the action with the highest value in the Q-table
            else:
                action = qtable[state].index(max(qtable[state]))

            # Compute the next state, the reward obtained, and whether the episode has finished
            next_state, reward, done = env.step(action)

            # Make the reward smaller on every step
            reward = reward - (steps * 0.3)

            # Update the Q-table with the values from the Bellman equation
            pos = reward * (gamma**steps / 2) + 0.9 * max(qtable[next_state])
            qtable[state][action] = pos

            # If the episode is over, draw the "A" at the final position
            if done:
                env.fin(i + 1)

            # Show the final map in Tkinter
            if done and i + 1 == epochs:
                time.sleep(30)
                tk.mainloop()

            # Update the state
            state = next_state
        # Reduce epsilon each iteration so the algorithm makes fewer random choices
        epsilon -= decay * epsilon

        print("\nDone in", steps, "steps".format(steps))
        time.sleep(0.8)
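The update applied to qtable above is the author's own variant. For comparison, the standard tabular Q-learning rule moves the old estimate toward the TD target by a learning rate; a minimal self-contained sketch is below (the learning rate alpha and the action count are assumptions, not taken from the snippet; the 60 states correspond to the 10 x 6 grid).

import numpy as np

alpha, gamma = 0.1, 0.8                      # learning rate (assumed) and discount factor
qtable = np.zeros((60, 4))                   # 60 states (10 x 6 grid), 4 actions (assumed)
state, action, next_state, reward = 0, 1, 1, -1.0

td_target = reward + gamma * np.max(qtable[next_state])          # Bellman target
qtable[state, action] += alpha * (td_target - qtable[state, action])  # move toward the target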
Example #8
import random
from Environment import MetaEnvironment as Env

f = open("query.txt")

queryList = []
for line in f.readlines():
    line = line.strip()
    queryList.append(line)

env = Env(5)

for i in range(5):
    traceList = queryList[i * 10:(i + 1) * 10]
    state = env.state(traceList)
    print(state)
    moveList = [
        random.randint(0, env.server_num - 1) for _ in range(len(env.nodes))
    ]
    env.take_actions(moveList)
    print('Loc:', env.locality())
    print('Load:', env.load())
Example #9
import pandas as pd
from tqdm import tqdm

from collections import namedtuple
StateVars = namedtuple('state_vars',
                       ['curr_state', 'prev_state_hash', 'reward'])
from Environment import State, Env

need = pd.read_csv('../fake_4region_trip_20170510.csv')
# dist=pd.read_csv('fake_4region_distance.csv')
# dist=dist.values
eps_num = 4
car_num = 1

env = Env(initial_region_state=[15, 15, 15, 15],
          capacity_each_step=10,
          max_episode=eps_num,
          car_count=car_num,
          need=need)
history = {i: dict() for i in range(8)}

for region in range(env.region_count):
    state = env.new_state()
    curr_state_hash = state.get_hash()
    state.out_stage()
    for car in range(env.car_num):
        for move in range(-env.capacity_each_step, env.capacity_each_step + 1):
            if state.check_feasible(region, car, move):

                new_state = state.step(region, car, move)
                new_state.in_stage()
                new_state_hash = new_state.get_hash()
Example #10
from Environment import Env
import numpy as np
import pandas as pd

initial_region_state = [15, 15, 15, 15]
capacity_each_step = 10
max_episode = 5
car_count = 1
need = pd.read_csv('../fake_4region_trip_20170510.csv')

env = Env(initial_region_state, capacity_each_step, max_episode, car_count, need)
NUM_ACTIONS = (2 * env.capacity_each_step + 1) * env.region_count  # [-500, 500] for each of the 4 regions
NUM_STATES = 2 * env.region_count + 7  # MountainCar-v0: (2,)

history_dict = {i: dict() for i in range(8)}
history_action = {i: dict() for i in range(8)}

state = env.init()
print(state)

for action in range(NUM_ACTIONS):
    env.reset()
    env.pre_step()
    move = action % (2 * env.capacity_each_step + 1) - env.capacity_each_step
    region = int(np.floor(action / (2 * env.capacity_each_step + 1)))
    if env.check_feasible(env.state, region, 0, move):
        state, reward, recent_R = env.step(region, 0, move)
        if (state in history_dict[0] and history_dict[0][state][0] < reward) \
                or state not in history_dict[0]:
            history_dict[0][state] = (reward, recent_R)  # record state -> (reward, recent_R)
            history_action[0][state] = (move, region, reward)  # record state -> (move, region, reward)
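The loop above flattens a (region, move) pair into a single action index, with moves ranging over [-capacity_each_step, capacity_each_step]. A small sketch of that encoding and its inverse (the capacity value matches the snippet; the helper names encode/decode are illustrative):

capacity_each_step = 10
moves_per_region = 2 * capacity_each_step + 1          # 21 possible moves: -10 .. +10

def encode(region, move):
    return region * moves_per_region + (move + capacity_each_step)

def decode(action):
    region, offset = divmod(action, moves_per_region)  # same arithmetic as the loop above
    return region, offset - capacity_each_step

assert decode(encode(2, -7)) == (2, -7)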
Example #11
    def transcate_DDPG(self):
        BATCH_SIZE = 32
        total_steps = 0  # step counter; one day is one step
        profit_list = []  # total profit of each game
        profitAdvanced_list = []
        actions = 2  # number of actions
        s_dim = 87
        a_dim = 1
        brain = DDPG(
            a_dim=a_dim,
            s_dim=s_dim,
            a_bound=1.,
            LR_A=0.001,
            LR_C=0.001,
            GAMMA=.99,
            TAU=0.01,
            # replacement=REPLACEMENT,
        )
        gameNum = 0  # number of games played
        ex_steps = 500  # number of rounds over which exploration decays
        epsilon = self.epsilon
        last_remainder = 0
        reward_list = [0]  # store each game's reward, used to compute the baseline
        Loss_list = []  # loss values collected during training
        wait_list = []  # waiting days recorded for each game
        gameSplit = 5000  # plot after this many games
        while total_steps < 60000:
            # initialize the game
            # routeId = random.randrange(0, 49, 1)
            routeId = 21
            self.routeline = self.allRoute[routeId]
            # print(self.routeline)
            env = Env(self.routeline)
            gameNum += 1
            # state = env.getState()  # accessed as state[0], state[1]
            today = env.getToday()

            terminal = False
            order_accepted = False
            isExploration = False
            create_date = 1
            end_date = 0
            stay_num = 0
            # one game
            # print("GAME#:",gameNum)
            baseline = 0
            tao_prob = []
            tao_reward = []
            wait_day = []  # which days the agent waited during this game

            while today < self.routeline[-1] and terminal == False:
                # a new order arrives (when there are already 10 orders, no new order is received here)
                if order_accepted == False:
                    self.orderSelect(self.routeline, 60)
                    # print(self.order)
                    env.setOrder(self.order)
                    order_accepted = True
                # iterate over the self.orders dict (i.e. state[0]) and handle each order
                state = env.getState()

                # current state
                state_tf = np.mat(state)
                # print(state_tf,len(state_tf))
                # let the neural network choose the action
                if random.random() < epsilon and isExploration == False:
                    isExploration = True
                    # end_date = random.randrange(env.getTodayIndex(),87,1)
                    end_date = 60

                if isExploration:
                    if env.getTodayIndex() == end_date:
                        action_model = 1
                        if ex_steps > 0:
                            ex_steps -= 1
                    else:
                        action_model = 0
                else:
                    #action from learning
                    action_model = brain.choose_action(state_tf)
                    # print(action_model)

                wait_day.append(env.getTodayIndex())
                # order dict, history curve, reward

                reward = env.getReward(action_model)
                tao_reward.append(reward)
                # the order is finished or the last day has been reached
                terminal = env.isTerminal(action_model)
                state_ = env.getNextState(action_model)
                if len(state_) == 1:
                    state_ = copy.deepcopy(state)
                brain.store_transition(state, action_model, reward, state_)
                # profitAdvanced_list.append(td_error[0][0])

                if brain.pointer > brain.MEMORY_CAPACITY:
                    # print(b_s_)
                    brain.learn()

                total_steps += 1
                if terminal:
                    # wait_list.append(wait_day[-1])
                    # loss = brain.learn()
                    # Loss_list.append(loss)
                    break

                # step: advance one day
                env.nextStep()

            # total profit for this game
            epsilon = self.epsilon * (ex_steps / 500)
            print("epsilon:", epsilon)
            print("TD_Error:", baseline)
            profit = env.getTotalReward()
            profit_list.append(profit)
            print("total_steps:", total_steps)
            print("profit_list", profit_list)
            print("profit:", profit, "profitAvg:", np.mean(profit_list))
            print("action-prob:", tao_prob)
            print("Reward:", tao_reward)
            print("wait_day:", wait_day)
            self.writeHistory('./picture/history.txt', epsilon, baseline,
                              total_steps, profit_list, profit, tao_prob,
                              tao_reward, wait_day, gameNum)

            print("########################" + str(gameNum) +
                  "###########################")
            if len(profit_list) >= gameSplit:
                plt.figure()
                plt.plot(profit_list, 'r-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_profit_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), profit_list)
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_profit_PG.jpg')
                plt.figure()
                plt.plot(profitAdvanced_list, 'g-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_advanced_PG.jpg')
                plt.figure()
                plt.plot(Loss_list, 'y-')
                plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), wait_list, c='r')
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_waitDay_PG.jpg')
            if len(profit_list) >= 500:
                profit_list.clear()
                wait_list.clear()
Example #12
    def transcate_PG(self):
        total_steps = 0  # step counter; one day is one step
        profit_list = []  # total profit of each game
        profitAdvanced_list = []
        actions = 2  # number of actions
        brain = PolicyGradient(
            n_actions=2,
            n_features=87,
            learning_rate=0.1,
            reward_decay=1,
        )
        gameNum = 0  # number of games played
        ex_steps = 500  # number of rounds over which exploration decays
        epsilon = self.epsilon
        last_remainder = 0
        reward_list = [0]  # store each game's reward, used to compute the baseline
        Loss_list = []  # loss values collected during training
        wait_list = []  # waiting days recorded for each game
        gameSplit = 500  # plot after this many games
        while total_steps < 60000:
            # initialize the game
            # routeId = random.randrange(0, 49, 1)
            routeId = 21
            self.routeline = self.allRoute[routeId]
            # print(self.routeline)
            env = Env(self.routeline)
            gameNum += 1
            # state = env.getState()  # accessed as state[0], state[1]
            today = env.getToday()

            terminal = False
            order_accepted = False
            isExploration = False
            create_date = 1
            end_date = 0
            stay_num = 0
            # one game
            # print("GAME#:",gameNum)
            baseline = 0
            tao_prob = []
            tao_reward = 0
            wait_day = []

            while today < self.routeline[-1] and terminal == False:
                # a new order arrives (when there are already 10 orders, no new order is received here)
                if order_accepted == False:
                    self.orderSelect(self.routeline, 60)
                    # print(self.order)
                    env.setOrder(self.order)
                    order_accepted = True
                    # print(self.order[1])
                # iterate over the self.orders dict (i.e. state[0]) and handle each order
                state = env.getState()

                # current state
                state_tf = np.mat(state)
                # print(state_tf,len(state_tf))
                # let the neural network choose the action
                if random.random() < epsilon and isExploration == False:
                    isExploration = True
                    end_date = random.randrange(env.getTodayIndex(), 87, 1)
                    # end_date = 60

                if isExploration:
                    if env.getTodayIndex() == end_date:
                        action_model = 1
                        if ex_steps > 0:
                            ex_steps -= 1
                    else:
                        action_model = 0
                else:
                    #action from learning
                    action_model, p = brain.choose_action(
                        state_tf, env.getTodayIndex())
                    tao_prob.append(p)
                if action_model == 0:
                    action_finishOrder = [1, 0]
                else:
                    action_finishOrder = [0, 1]

                # order dict, history curve, reward

                reward = env.getReward(action_model)

                # the order is finished or the last day has been reached
                terminal = env.isTerminal(action_model)
                if terminal:
                    tmp = reward
                    baseline = np.mean(reward_list)
                    profitAdvanced_list.append(baseline)
                    reward -= baseline
                    reward_list.append(tmp)
                    # print("END_REWARD:",reward,",reward_list:",reward_list)
                # store the transition in the memory buffer
                # print("this is store arg:",state_tf,";", action_model,";", reward,";", env.getTodayIndex())
                brain.store_transition(state_tf, action_model, reward,
                                       env.getTodayIndex())
                # print(action_model)

                total_steps += 1
                if terminal:
                    loss, wait_day, tao_reward = brain.learn()
                    Loss_list.append(loss)
                    wait_list.append(wait_day[-1])
                    break

                # step: advance one day
                env.nextStep()

            # total profit for this game
            epsilon = self.epsilon * (ex_steps / 500)
            print("epsilon:", epsilon)
            print("Baseline:", baseline)
            profit = env.getTotalReward()
            profit_list.append(profit)
            print("total_steps:", total_steps)
            print("profit_list", profit_list)
            print("profit:", profit, "profitAvg:", np.mean(profit_list))
            print("action-prob:", tao_prob)
            print("Reward:", tao_reward)
            print("wait_day:", wait_day)
            self.writeHistory('./picture/history.txt', epsilon, baseline,
                              total_steps, profit_list, profit, tao_prob,
                              tao_reward, wait_day, gameNum)

            print("########################" + str(gameNum) +
                  "###########################")
            if len(profit_list) >= gameSplit:
                plt.figure()
                plt.plot(profit_list, 'r-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_profit_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), profit_list)
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_profit_PG.jpg')
                plt.figure()
                plt.plot(profitAdvanced_list, 'g-')
                plt.savefig('./picture/' + str(gameNum) +
                            'liner_advanced_PG.jpg')
                plt.figure()
                plt.plot(Loss_list, 'y-')
                plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
                plt.figure()
                plt.scatter(np.arange(gameSplit), wait_list, c='r')
                plt.savefig('./picture/' + str(gameNum) +
                            'scatter_waitDay_PG.jpg')
                profit_list.clear()
                wait_list.clear()
Example #13
    utility_next = utility_matrix[next_state]
    delta = reward + gamma * utility_next - utility
    utility_matrix[state] += alpha * (delta)

    return utility_matrix, delta


def updateActor(state_action_matrix, state, action, delta):
    beta = 1
    state_action_matrix[state, action] += beta * delta

    return state_action_matrix


if __name__ == '__main__':
    environment = Env()

    alpha = 0.1
    gamma = 0.99
    epsilon = 0.01

    number_of_episodes = 10000

    state_action_pairs = numpy.full(
        (environment.normalizedtree.getNumberOfNodes(), 2), 0.5)
    utility_matrix = numpy.zeros(
        [environment.normalizedtree.getNumberOfNodes()])

    softmax = lambda vals: numpy.exp(vals - numpy.max(vals)) / numpy.sum(
        numpy.exp(vals - numpy.max(vals)))
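This example sets up a TD actor-critic: the truncated critic update at the top computes delta = reward + gamma * utility_next - utility and moves the state's utility by alpha * delta, updateActor then nudges the chosen action's preference by beta * delta, and the softmax turns per-state preferences into action probabilities. A minimal self-contained sketch of how an action would be drawn from those preferences (the 5-state table size here is an illustrative assumption, not the tree size used in the snippet):

import numpy as np

state_action_pairs = np.full((5, 2), 0.5)    # 5 states x 2 actions, initialised to 0.5 like the snippet
state = 0

prefs = state_action_pairs[state]
probs = np.exp(prefs - np.max(prefs)) / np.sum(np.exp(prefs - np.max(prefs)))  # numerically stable softmax
action = np.random.choice(len(probs), p=probs)   # sample an action from the softmax policy
print(action, probs)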