Example No. 1
def run_testcase(filename):
    # find the start and end destinations for this test case (starting_... and ending_...)
    finder = findDestinations(filename)
    end = finder.returnDestination()
    start = finder.returnStarting()

    map_file = np.loadtxt('map.txt', dtype=int)

    # set the map border to a large negative value to keep the agent in bounds
    map_file[0, :] = MIN_VALUE
    map_file[:, 0] = MIN_VALUE
    map_file[:, len(map_file) - 1] = MIN_VALUE
    map_file[len(map_file) - 1, :] = MIN_VALUE

    # emulated UAV map environment
    env = Map(start, end, filename, map_file)
    RL = DeepQNetwork(env.n_actions, env.n_features,
                    learning_rate=0.01,
                    reward_decay=0.9,
                    e_greedy=0.9,
                    replace_target_iter=200,
                    memory_size=2000,
                    output_graph=True,
                    iteration=filename
                    )
    run_map(filename, RL, env)
    RL.plot_cost()

    # compare against the given reference results
    compare = Compare(filename)
    print("Finished iteration", filename)
Example No. 2
    def __init__(self):
        self.gameStart = False
        self.status = False
        self.reward = 0
        super(view, self).__init__()
        self.n_actions = 361    # number of possible actions
        self.n_features = 361
        self.doneList = []
        self.allphoto = []
        self.initView()
        self.env = env()
        self.wobservation = None
        self.wobservation_ = None
        self.action1 = None
        self.RL = DeepQNetwork(self.n_actions, self.n_features)
Example No. 3
def train(**kwargs):
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    RL = DeepQNetwork(env.n_actions, env.n_features, opt)
    # pass the callback and its argument separately so Tk schedules it instead of calling it immediately
    env.after(100, update_dqn, RL)
    env.mainloop()
Example No. 4
    def __init__(self):
        start_table = dict()
        end_table = dict()
        self.RL = DeepQNetwork(n_actions,
                               n_features,
                               learning_rate=0.01,
                               reward_decay=0.9,
                               e_greedy=0.9,
                               replace_target_iter=200,
                               memory_size=2000,
                               output_graph=False,
                               testing=False)

        filename = "test_destinations.txt"
        f = open(filename, "r")

        for line in f:
            nums = line.split(';')
            start_ = nums[0].split(',')
            end_ = nums[1].split(',')

            start = [int(start_[0]), int(start_[1])]
            end = [int(end_[0]), int(end_[1])]

            start_table[start[0]] = start[1]
            end_table[end[0]] = end[1]
        f.close()

        # training time keeping
        total_time = 0
        start = time.time()

        # train on the loaded samples
        self.run_training(150, start_table, end_table)

        # minutes spent training
        total_time = (time.time() - start) / 60
        time_file = "trainTime.txt"
        f = open(time_file, "w+")
        f.write(str(total_time))
        f.close()
Example No. 5
    def __init__(self,
                 budget,
                 times,
                 users,
                 n_scope,
                 r_interval=0.01,
                 isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim,
                                    self.state_dim,
                                    e_greedy_increment=None)
Example No. 6
def DQN():
    import tensorflow as tf
    from DQN import DeepQNetwork
    import numpy as np

    game.restart_game()

    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    dqn = DeepQNetwork(sess, game)

    game_state = game.current_state()

    start_state = np.concatenate(
        (game_state, game_state, game_state, game_state), axis=2)
    s_t = start_state

    while not game.game_end():
        # choose an action epsilon greedily
        _, action_index = dqn.choose_action(s_t)

        move = action_index
        game.do_move(move)

        pygame.event.pump()

        game_state = game.current_state()
        s_t = np.append(game_state, s_t[:, :, :-2], axis=2)

        screen.fill(black)

        game.snake.blit(rect_len, screen)
        game.strawberry.blit(screen)
        game.blit_score(white, screen)

        pygame.display.flip()

        fpsClock.tick(15)

    crash()
Example No. 7
    def __init__(self,
                 input_dims,
                 n_actions,
                 lr,
                 mem_size,
                 batch_size,
                 epsilon,
                 gamma=0.99,
                 eps_dec=5e-7,
                 eps_min=0.01,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        self.lr = lr
        self.batch_size = batch_size
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.replace = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)
Example No. 8
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=10_000):
        self.gamma = gamma  # used to discount future rewards
        self.epsilon = epsilon  # used by the epsilon-greedy action-selection algorithm
        self.lr = lr  # learning rate: essentially, how big a step the optimizer takes
        self.n_actions = n_actions  # number of actions available to our agent in its environment
        self.action_space = [i for i in range(n_actions)]  # indices of the possible actions to choose from
        self.input_dims = input_dims  # dimensions of our input, as defined by the agent's environment
        self.mem_size = mem_size  # maximum number of memories to store
        self.batch_size = batch_size  # mini-batch size to sample from memory
        self.eps_min = eps_min  # smallest possible epsilon value for our agent
        self.eps_dec = eps_dec  # how much to decrease epsilon each iteration
        self.replace_after = replace  # iterations between copying the local network into the target network
        self.steps = 0  # iteration counter used with replace_after

        # create a ReplayBuffer to store our memories; also used to sample mini-batches
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.Q_local = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims)
        self.Q_target = DeepQNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims)
Example No. 9
def main():
    env = RideHitch("data/norm1000.txt")
    print(env.requests_list)
    RL = DeepQNetwork(env.pool_size,
                      env.state_num,
                      learning_rate=0.01,
                      reward_decay=0.99,
                      e_greedy=1,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=False,
                      T=env.T_threshold,
                      D=env.D_threshold)
    step = 0
    matched_list = []
    for episode in range(100):
        # init
        observation = env.reset(reset_seq=False)
        # if episode % 100 == 0:
        #     print(episode)
        matched = 0
        print("seq size:", env.request_num, "pool size:", env.pool_size)
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            if reward > 0:
                matched += 1
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
        matched_list.append(matched)
        print("eps", episode, "matching", matched)
    # print(matched_list)
    RL.plot_cost()
Example No. 10
MAX_EPISODES = 900
ON_TRAIN = True

# set env
env = DataEnv()
s_dim = env.state_dim
a_dim = env.action_dim
clf = Classifier('logistic')

# set RL method
rl = DeepQNetwork(
    a_dim,
    s_dim,
    learning_rate=0.01,
    reward_decay=0.9,
    e_greedy=0.9,
    replace_target_iter=200,
    memory_size=2000,
    # output_graph=True
)

steps = []


def train():
    # start training
    for i in range(MAX_EPISODES):
        state = env.reset(clf)
        ep_r = 0.
        while True:
            # env.render()
Example No. 11
Tensorflow: 1.0
gym: 0.8.0
"""

from wsn_env import Env
from DQN import DeepQNetwork
import matplotlib.pyplot as plt
import numpy as np

env = Env()

RL = DeepQNetwork(
    n_actions=env.n_actions,
    n_features=env.n_features,
    learning_rate=0,
    e_greedy=0,
    replace_target_iter=300,
    memory_size=3000,
    e_greedy_increment=0.0002,
)

total_steps = 0
rewards = []
for i_episode in range(env.times):

    observation = env.reset()

    ep_r = 0
    while True:

        action = RL.choose_action(observation)
Example No. 12
            if once == False:
                once = True
                print "break out of the while loop"
                print DQN.epsilon
                print DQN.learn_step_counter
            break
            time.sleep(3.0)
        """


global DQN
DQN = DeepQNetwork(
    n_actions,
    n_features,
    learning_rate=0.03,
    reward_decay=0.9,
    replace_target_iter=150,
    memory_size=1000,
    # output_graph=True
)
t = threading.Thread(target=run)
t.daemon = True
t.start()
t.join()
start_simulation()
DQN.plot_q_t()
DQN.plot_cost()

plot_values = []
accumulation = 0
for i in range(len(scores)):
Example No. 13

if __name__ == '__main__':
    rewards = [[], []]  # reward values for the makespan agent and the cost agent
    records = [[], [], []]  # metric values and policy sets for the makespan agent and the cost agent

    scaler = StandardScaler()

    env = Env(N_AGENT)
    memories = [Memory(MEMORY_SIZE) for i in range(N_AGENT)]
    memory = Memory(MEMORY_SIZE)

    dqn = [
        DeepQNetwork(env.n_actions,
                     env.n_features,
                     i,
                     learning_rate=0.0001,
                     replace_target_iter=REPLACE_TARGET_ITER,
                     e_greedy_increment=2e-5) for i in range(N_AGENT)
    ]

    run_env()

    fig, (ax0, ax1) = plt.subplots(nrows=2)
    ax0.grid(True)
    ax0.set_xlabel('episodes')
    ax0.set_ylabel('makespan metric')
    line01, = ax0.plot(rewards[0],
                       color='orange',
                       label="rewards",
                       linestyle='-')
    # line02, = ax0.plot(records[0], label = "records", linewidth=2)
Example No. 14
max_steps = 1000
Env = env()
root = "./Result/"
child0 = mytyme
child1 = ['Datas/', "Src/"]
child2 = ["DQN/", 'Double-DQN/', 'Dueling-DQN/', "Double-Dueling-DQN/"]
child3 = ["data/", "map/", "memory/"]
kind = 0
envs = 1
support = Support(root=root,
                  child0=child0,
                  child1=child1,
                  child2=child2,
                  child3=child3)
kind = 3
DQN = DeepQNetwork(double_q=False, dueling_q=False, env=str(envs))
steps = 0
for i in range(30):
    start = time.time()
    support.create_csv(Env.save_title, kind=kind, i=i + 1)
    s = Env.reset()
    while not Env.done:
        action = DQN.choose_actions(s)
        s_, r, done, advise = Env.step(action)
        DQN.store_transition(s, action, r, s_)
        s = s_
        if steps > 100:
            DQN.learn()
        Env.loss = DQN.cost
        steps += 1
        Env.steps += 1
Example No. 15
            if done:
                break
            step += 1
        s.append(count)

    plt.plot(np.arange(len(s)), s)
    plt.ylabel('points to goal')
    plt.xlabel('training steps')
    plt.savefig("figPtsv1.png")

    total_time = time.time() - start
    f = open("trainTime.txt", "w+")
    f.write(str(total_time))
    f.close()
    print('Finished')


if __name__ == "__main__":
    # maze game
    env = Map()
    RL = DeepQNetwork(env.n_actions,
                      env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True)
    run_map()

    RL.plot_cost()
Example No. 16
from DQN import DeepQNetwork
import gym

env = gym.make("CartPole-v0")
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=env.observation_space.shape[0],  # (4,)[0]
                  learning_rate=.01,
                  e_greedy=.9,
                  replace_target_iter=100,
                  memory_size=2000,
                  e_greedy_increment=.001,
                  )

total_steps = 0

for episode in range(1000):
    observation = env.reset()

    ep_r = 0

    while True:
        env.render()

        action = RL.choose_action(observation)
Example No. 17
if __name__ == "__main__":
    n_serverfarms = 20
    n_servers = 15
    n_vms = 5
    env = e.cloud_env(n_serverfarms, n_servers, n_vms)

    df_task_usage = read_data.read_data()
    job_queue = {}

    RL_farm = DeepQNetwork(
        n_serverfarms * 24,
        n_serverfarms * n_servers * 2 + 4,
        '_farm',
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )

    RL_server = []
    for i in range(n_serverfarms):
        dqn = DeepQNetwork(
            n_servers,
            n_servers * 2 + 4,
            '_server_' + str(i),
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
Example No. 18
            t = t + 1
        else:
            reward = match.matching(graph.left, graph.right, graph.edge)
            print("r", reward)
            l_num = len(graph.left)
            r_num = len(graph.right)
            l = 0
            state_ = np.array([
                l_num, r_num,
                match.fake_matching(graph.left, graph.right, graph.edge), l
            ])
            RL.store_transition(state, action, reward, state_)
            t = t + 1
        if (t > 200) and (t % 5 == 0):
            print("sss")
            RL.learn()
        all_reward = reward + all_reward
        print("得分", all_reward, "轮数", t)


if __name__ == '__main__':
    RL = DeepQNetwork(2,
                      4,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True)
    run_net()
Example No. 19
class DDQNAgent():
    """
        A Double DQN agent has two networks. One local network and one target network.
        The local network is trained every iteration and is used for predictive action.
        The target network is updated to a soft copy of the local network every so often.

        The reason is because the Bellman equation would be valuing the network that is predicting
        as well as that same network being used to calculate loss. We have this separation of training
        and predicting to help the agent learn.
    """
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=10_000):
        self.gamma = gamma  # used to discount future rewards
        self.epsilon = epsilon  # used by the epsilon-greedy action-selection algorithm
        self.lr = lr  # learning rate: essentially, how big a step the optimizer takes
        self.n_actions = n_actions  # number of actions available to our agent in its environment
        self.action_space = [i for i in range(n_actions)]  # indices of the possible actions to choose from
        self.input_dims = input_dims  # dimensions of our input, as defined by the agent's environment
        self.mem_size = mem_size  # maximum number of memories to store
        self.batch_size = batch_size  # mini-batch size to sample from memory
        self.eps_min = eps_min  # smallest possible epsilon value for our agent
        self.eps_dec = eps_dec  # how much to decrease epsilon each iteration
        self.replace_after = replace  # iterations between copying the local network into the target network
        self.steps = 0  # iteration counter used with replace_after

        # create a ReplayBuffer to store our memories; also used to sample mini-batches
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.Q_local = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims)
        self.Q_target = DeepQNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims)

    def store_memory(self, state, action, reward, state_, done):
        """
            Save a new memory to our ReplayBuffer
        """
        self.memory.store_memory(state, action, reward, state_, done)

    def sample_batch(self):
        """
            Pull a stochastic mini-batch from our ReplayBuffer
        """
        state, action, reward, state_, done = \
                            self.memory.sample_batch(self.batch_size)

        states = T.tensor(state).to(self.Q_local.device)
        actions = T.tensor(action).to(self.Q_local.device)
        rewards = T.tensor(reward).to(self.Q_local.device)
        states_ = T.tensor(state_).to(self.Q_local.device)
        dones = T.tensor(done).to(self.Q_local.device)

        return states, actions, rewards, states_, dones


    def choose_action(self, observation):
        """
            Choose an action from our action space using an epsilon-greedy algorithm.
            We can either EXPLOIT, or EXPLORE based on a random probability.

            Exploiting will choose the best known action. (confidence)

            Exploring will try a random action, which may expose the agent to new
            information to learn from.
        """
        if np.random.random() > self.epsilon:  # epsilon-greedy (EXPLOIT)
            state = T.tensor([observation], dtype=T.float).to(self.Q_local.device)
            actions = self.Q_local.forward(state)
            action = T.argmax(actions).item()  # .item() extracts the action index from the tensor
        else:  # (EXPLORE)
            action = np.random.choice(self.action_space)  # choose a random action from our action space

        return action

    def replace_target_network(self):
        """
            after replace_after iterations we update our target network
            to be a soft copy of our local network
        """
        if self.replace_after is not None and \
                    self.steps % self.replace_after == 0:
            self.Q_target.load_state_dict(self.Q_local.state_dict())

    def decrement_epsilon(self):
        """
            decrease epsilon, but not below eps_min
        """
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self):
        """
            Main part of our agent.

            First we zero the gradient of our optimzier to stop exploding gradients.
            Then we sample a stochastic mini-batch from our ReplayBuffer.

            Then we make predictions and evaluations of this random mini-batch, step our optimzer
            and calculate loss.

            Finally, we decrement our epsilon and begin the cycle of (SEE->DO->LEARN) once again.
        """
        if self.memory.mem_cntr < self.batch_size:#if we dont have a full batch of memories, dont learn quite yet
            return

        self.Q_local.optimizer.zero_grad()#zero out our gradient for optimzer. Stop exploding gradients

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_batch()

        indices = np.arange(self.batch_size)

        q_pred = self.Q_local.forward(states)[indices, actions]  # local network's predictions
        q_next = self.Q_target.forward(states_)  # target network's evaluation of the next states
        q_eval = self.Q_local.forward(states_)

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0  # terminal next states contribute no future value

        q_target = rewards + self.gamma * q_next[indices, max_actions]  # Bellman equation
        loss = self.Q_local.loss(q_target, q_pred).to(self.Q_local.device)
        loss.backward()#back-propagation

        self.Q_local.optimizer.step()
        self.steps += 1

        self.decrement_epsilon()

    def save_agent(self):
        self.Q_local.save_model('local')
        self.Q_target.save_model('target')

    def load_agent(self):
        self.Q_local.load_model('local')
        self.Q_target.load_model('target')
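
A minimal training-loop sketch for the DDQNAgent above, assuming a classic Gym-style environment. The CartPole-v0 environment, hyperparameter values, and episode count below are illustrative placeholders, not part of the original snippet.

import gym

# Hypothetical setup: the environment and hyperparameters are chosen for illustration only.
env = gym.make('CartPole-v0')
agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                  n_actions=env.action_space.n,
                  input_dims=env.observation_space.shape,
                  mem_size=50000, batch_size=32)

for episode in range(500):
    observation = env.reset()
    done, score = False, 0
    while not done:
        action = agent.choose_action(observation)            # SEE -> DO
        observation_, reward, done, info = env.step(action)
        agent.store_memory(observation, action, reward, observation_, done)
        agent.learn()                                         # LEARN
        observation = observation_
        score += reward
    print('episode', episode, 'score', score, 'epsilon', agent.epsilon)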
Example No. 20
config = load_config(config_path)

reward_list = []

RL = None
for n_agent in range(1, 13):
    for seed in range(1, 13):

        env = Select(params=config['env'], n_agent=n_agent, attack_mode=args.attack_mode)
        # reset the default graph to avoid model re-loading issues inside the for loop
        tf.reset_default_graph()
        RL = DeepQNetwork(env.n_action, env.n_agent,
                          seed=seed,
                          learning_rate=args.learning_rate,
                          reward_decay=args.reward_delay,  # whether to emphasize short-term or long-term rewards
                          replace_target_iter=args.replace_target_iter,
                          memory_size=args.memory_size,
                          output_graph=args.output_graph,
                          n_input=n_agent,
                          prioritized=args.prioritized
                          )
        reward_max = 0
        observation_max = []
        episode_max = 0
        # reward_last = 0
        # observation_last = []
        # episode_last = 0
        reward = 0
        step = 0
        print('===========' + ' start train! ' + '===========')
        print()
        for episode in range(args.max_episode):
Example No. 21
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import time
import numpy as np
import random
# Global variables
MIN_VALUE = -10000
IMG_SIZE = 300
action_space = ['u', 'd', 'l', 'r', 'ur', 'rd', 'ld', 'ul']
n_actions = len(action_space)
n_features = 2
RL = DeepQNetwork(n_actions,
                  n_features,
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=200,
                  memory_size=2000,
                  output_graph=False,
                  testing=True)


def init_map(filename):
    map_file = np.loadtxt('map.txt', dtype=int)
    # bounding negative values for keeping it in bounds
    map_file[0, :] = MIN_VALUE
    map_file[:, 0] = MIN_VALUE
    map_file[:, len(map_file) - 1] = MIN_VALUE
    map_file[len(map_file) - 1, :] = MIN_VALUE
    return map_file
Example No. 22
print(str(env))

brain_name = env.external_brain_names[0]

tf.reset_default_graph()

summary_path = "./summaries/{}".format(run_path)

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

##Q2 Start
RL = DeepQNetwork(4,
                  8,
                  learning_rate=0.001,
                  reward_decay=0.99,
                  e_greedy=0.975,
                  replace_target_iter=4,
                  memory_size=10000,
                  e_greedy_increment=None)
##Q2 End

init = tf.global_variables_initializer()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
summary_writer = tf.summary.FileWriter(summary_path)


def PrintAndSaveSummary(writer, episode, episodeStep, episodeReward, epsilon,
                        lr):
    print('Episode:', episode, 'length =', episodeStep)
Example No. 23
                    print(
                        'episode:' + str(episode) + ' steps:' + str(step) + ' reward:' + str(
                            rwd) + ' eps_greedy:' + str(
                            dqn.epsilon))
                rewards.append(rwd)
                break


if __name__ == '__main__':
    rewards = []

    env = Env(N_VM)

    memories = Memory(MEMORY_SIZE)

    dqn = DeepQNetwork(env.n_actions, env.n_features,
                       learning_rate=0.001,
                       replace_target_iter=200,
                       e_greedy_increment=3e-5
                       )

    run_env(EPISODES, MINI_BATCH)

    dqn.plot_cost()

    plt.plot(np.arange(len(rewards)), rewards)
    plt.plot(np.arange(len(rewards)), [138 for i in range(len(rewards))])
    plt.ylabel('reward')
    plt.xlabel('episode')
    plt.show()
Example No. 24
    return action[0]

# DQN
def action_transform(pp_obser):
    return [pp_obser['pp'], pp_obser['n']]

env = env.unwrapped

print(env.action_space)
print(env.observation_space)
#print(env.observation_space.high)
#print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=2,
                  learning_rate=alpha, reward_decay=gamma, e_greedy=0.9,
                  replace_target_iter=100, memory_size=2000,
                  e_greedy_increment=0.01,)
total_steps = 0

best_reward = 0
best_pp = None
reward_list = []
for episode in range(nEpisodes):
    # DQN
    observation, info = env.reset()
    # frame=info["frame"]
    ep_r = 0
Example No. 25
            if RL.memory_counter > MEMORY_CAPACITY:
                RL.learn()

            # break out of the while loop at the end of this episode
            if done:
                break

            step += 1

            state = state_

        print('episode is:', episode)

    # end of game
    print('training over')

    # end of game
    print('game over')


if __name__ == "__main__":
    size = 12
    m = 6
    MEMORY_CAPACITY = 4000
    board = Board(size, m)
    RL = DeepQNetwork(size**2, size**2)

    train(1)

    RL.save_net()
    print('Finish')
Example No. 26
            observation_, reward, done = env.step(action)

            RL.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    # env.destroy()

if __name__ == "__main__":
    env = Environment(rule=Rule())
    RL = DeepQNetwork(list(range(41)), 2,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    main()
    RL.plot_cost()
Example No. 27
        return 1.0 + PENALTY, dist_

    if dist_ > dist:
        return -1.0 + PENALTY, dist_

    return 0.0 + PENALTY, dist_


if __name__ == "__main__":
    # maze game
    env = Env()
    RL = DeepQNetwork(env.n_actions,
                      env.n_features,
                      learning_rate=0.0001,
                      reward_decay=0.9,
                      e_greedy=0.75,
                      replace_target_iter=2000,
                      memory_size=MEMORYCAPACITY,
                      batch_size=64
                      # output_graph=True
                      )
    RL.restore_model()
    for episode in range(EPS):
        env.build_map()
        value = 0

        for step in range(STEP):
            state = env.state.copy()
            action = RL.choose_action(state)
            env.step(action_space[action])
            state_ = env.state.copy()
            reward, dist = compute_reward(state, state_)
Example No. 28
class LTDQN(Approach):
    def __init__(self,
                 budget,
                 times,
                 users,
                 n_scope,
                 r_interval=0.01,
                 isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim,
                                    self.state_dim,
                                    e_greedy_increment=None)

    def generate_reward(self, action, user):
        if action == 1:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
        elif action == 2:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
        elif action == 3:
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 4:
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 5:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 6:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 7:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 8:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1

    def simulate(self):
        self.dqn.load()
        for ep in range(1):
            # self.users = self.init_users_list()
            total_benefits = 0.
            total_expense = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            # if user.default_single_r >= 0.5:
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            # else:
                            #     user.receive_offer(0, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)

                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()

                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                        # self.dqn.store_transition(user.state, action, reward, user.state_)
                        user.state = user.state_.copy()
                        # self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                    total_expense += benefits / (1. - benefits + 0.001)
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num /
                                                len(self.users))
                    self.ratio.append(total_expense)
                print(
                    "\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                    (ep, time, self.budget, total_benefits),
                    end=' ')
            print()

    def init_users_list(self):
        user_list = []
        arr = np.loadtxt('../dataset/test.txt', delimiter=' ')
        # arr = arr[0:2000, :]  # train
        # print(arr)
        total_cost = 0.
        for row in range(arr.shape[0]):
            data = arr[row, :]
            # print(data[0])
            user = User(row, float(data[0]), data[1:])
            total_cost += user.preference[np.argmax(
                user.preference)] - user.preference[-1]
            user_list.append(user)
        print(len(user_list), total_cost)
        return user_list

    def train(self):
        for ep in range(500):
            self.budget = 50000
            self.users = self.init_users_list()
            # self.users = self.init_users_list()
            self.dqn.epsilon = 0
            total_benefits = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)

                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()

                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                        # print(user.state_, user.state)
                        self.dqn.store_transition(user.state, action, reward,
                                                  user.state_)
                        user.state = user.state_.copy()
                        self.dqn.learn()

                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num /
                                                len(self.users))
                print(
                    "\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                    (ep, time, self.budget, total_benefits),
                    end=' ')
                if self.budget <= 0:
                    break
            print()
            self.dqn.save()
Example No. 29
    # end
    print('over')
    sys.exit()


if __name__ == "__main__":
    ###get environment
    #env = gym.make('HalfCheetah-v2')##HalfCheetah, Ant, Humanoid
    env = TwoLeggedEnv()  #SixLeggedEnv()
    # env = myEnv()  # self-defined environment

    ###initialize rl_agent
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print(action_dim)
    if (isTrain):
        model_path = "models/dqn_two_legged"
    else:
        model_path = "models/dqn_two_legged_final"
    rl_agent = DeepQNetwork(model_path,
                            action_dim,
                            state_dim,
                            learning_rate=0.01,
                            reward_decay=0.9,
                            e_greedy=0.9,
                            replace_target_iter=200,
                            memory_size=2000,
                            output_graph=True)
    # pass rl_agent to run the environment
    run_ant(rl_agent)
Example No. 30
from DQN import DeepQNetwork

dqn = DeepQNetwork(atari_env='SpaceInvaders-v4',
                   state_dimension=[88, 80, 3],
                   action_dimension=6,
                   train_step=4)

dqn.run()