Example #1
	def initialize_policies(self):
		self.Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))
		self.policy_net_agent = DQN(n_feature = self._state_dim)
		self.policy_net_agent.double()
		self.target_net_agent = DQN(n_feature = self._state_dim)
		self.target_net_agent.double()
		self.target_net_agent.load_state_dict(self.policy_net_agent.state_dict())
Example #2
    def __init__(self, config, device, model=False):

        self.device = device

        self.board_size = config.board_size

        self.eps_start = config.eps_start
        self.eps_end = config.eps_end
        self.eps_decay = config.eps_decay
        self.gamma = config.gamma
        self.batch_size = config.batch_size

        # This part is for the network
        if model:
            self.policy_net = torch.load(model, map_location=device)
        else:
            self.policy_net = DQN(config).to(device)

        # Be aware that the config must be the exact same for the loaded model
        self.target_net = DQN(config).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       momentum=config.momentum,
                                       lr=config.lr)
        self.criterion = torch.nn.SmoothL1Loss()
        self.memory = ReplayMemory(config.replay_memory)
        self.steps_done = 0
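Example #2 (and several later snippets) constructs a ReplayMemory without showing it. A minimal sketch of the usual namedtuple-based buffer such code assumes; the push/sample method names are an assumption, not taken from these examples:

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Fixed-capacity transition buffer with uniform random sampling."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # Store one Transition(state, action, next_state, reward)
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)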
Example #3
def main():
    env = gym.make('Acrobot-v1')
    gamma = 0.99
    copy_step = 25
    num_states = len(env.observation_space.sample())
    num_actions = env.action_space.n
    hidden_units = [64, 64]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    iter_per_episode = 300

    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size)
    N = 50
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.08
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step, iter_per_episode)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        if n % 5 == 0:
            print("Progress:", int(n/N*100), "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards)
    print("avg reward for last 100 episodes:", avg_rewards)
    make_video(env, TrainNet, 300)
    env.close()
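play_game and make_video are not shown in Example #3. Below is a hedged sketch of what play_game typically does in this style of loop; the method names get_action, add_experience, train and copy_weights are assumptions about the DQN class, not confirmed by the snippet:

def play_game(env, TrainNet, TargetNet, epsilon, copy_step, iter_per_episode):
    rewards = 0
    observations = env.reset()
    for step in range(iter_per_episode):
        # Epsilon-greedy action from the online network (assumed method name)
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        TrainNet.add_experience({'s': prev_observations, 'a': action,
                                 'r': reward, 's2': observations, 'done': done})
        TrainNet.train(TargetNet)
        # Periodically sync the target network with the online network
        if step % copy_step == 0:
            TargetNet.copy_weights(TrainNet)
        if done:
            break
    return rewards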
Example #4
    def __init__(self, load_from_previous_model):
        self.policy_net = DQN(STATE_DIMENSION, NUM_ACTIONS).to(self.device)
        self.target_net = DQN(STATE_DIMENSION, NUM_ACTIONS).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       lr=LEARNING_RATE)
        self.replayMemory = ReplayMemory(10000)

        if load_from_previous_model:
            self.load_model()
Example #5
 def initialize_policies(self):
     self.Transition = namedtuple(
         'Transition', ('state', 'action', 'next_state', 'reward'))
     self.policy_net_agent = DQN(n_feature=self._state_dim)
     self.policy_net_agent.double()
     self.target_net_agent = DQN(n_feature=self._state_dim)
     self.target_net_agent.double()
     self.target_net_agent.load_state_dict(
         self.policy_net_agent.state_dict())
     self.optimizer_agent = optim.RMSprop(
         self.policy_net_agent.parameters(),
         lr=self._lr,
         weight_decay=self._weight_decay)
Example #6
    def build_net(self):
        print('Building QNet and targetnet...')
        self.qnet = DQN(self.params, 'qnet', self.params['TB_logpath'])
        self.targetnet = DQN(self.params, 'targetnet',
                             self.params['TB_logpath'])
        self.sess.run(tf.global_variables_initializer())
        saver_dict = {
            'qw1': self.qnet.w1,
            'qb1': self.qnet.b1,
            'qw2': self.qnet.w2,
            'qb2': self.qnet.b2,
            'qw3': self.qnet.w3,
            'qb3': self.qnet.b3,
            'qw4': self.qnet.w4,
            'qb4': self.qnet.b4,
            'qw5': self.qnet.w5,
            'qb5': self.qnet.b5,
            'tw1': self.targetnet.w1,
            'tb1': self.targetnet.b1,
            'tw2': self.targetnet.w2,
            'tb2': self.targetnet.b2,
            'tw3': self.targetnet.w3,
            'tb3': self.targetnet.b3,
            'tw4': self.targetnet.w4,
            'tb4': self.targetnet.b4,
            'tw5': self.targetnet.w5,
            'tb5': self.targetnet.b5,
            'step': self.qnet.global_step
        }
        self.saver = tf.train.Saver(saver_dict)
        self.cp_ops = [
            self.targetnet.w1.assign(self.qnet.w1),
            self.targetnet.b1.assign(self.qnet.b1),
            self.targetnet.w2.assign(self.qnet.w2),
            self.targetnet.b2.assign(self.qnet.b2),
            self.targetnet.w3.assign(self.qnet.w3),
            self.targetnet.b3.assign(self.qnet.b3),
            self.targetnet.w4.assign(self.qnet.w4),
            self.targetnet.b4.assign(self.qnet.b4),
            self.targetnet.w5.assign(self.qnet.w5),
            self.targetnet.b5.assign(self.qnet.b5)
        ]
        self.sess.run(self.cp_ops)

        if self.params['ckpt_file'] is not None:
            print('\x1b[1;30;41m RUN LOAD \x1b[0m')
            self.load()

        print('Networks have been built!')
        sys.stdout.flush()
Example #7
def main():
    env = gym.make(ENV_NAME)
    agent = DQN.DQN(env)
    for episode in range(EPISODE):
        state = env.reset()
        # train
        for step in range(STEP):
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # Define a shaped reward for training: -1 on failure, small bonus otherwise
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # Evaluate the current greedy policy every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct (greedy) action for evaluation
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode', episode, 'Evaluation Average Reward:', ave_reward)
            if ave_reward >= 200:
                break
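agent.egreedy_action() in Example #7 is not defined here; the rule it names is plain epsilon-greedy selection over the Q-values. A generic sketch, independent of the agent's actual implementation:

import numpy as np

def epsilon_greedy(q_values, epsilon):
    # With probability epsilon pick a random action, otherwise the greedy one
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))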
Example #8
def main():
    env = gym.make("Boxing-v0")
    height = 84
    width = 84
    channels = 4
    num_actions = 18
    dqn = DQN(AtariNetwork(height, width, channels),
              height * width,
              num_actions,
              epsilon=1.0,
              epsilon_decay=0.999,
              num_stacked=channels,
              learning_rate=0.1)
    memory = MemoryReplay(height * width,
                          num_actions,
                          max_saved=10000,
                          num_stacked=channels)

    for epoch in tqdm(range(1000)):

        # Gain experience
        for _ in range(1):
            s = env.reset()
            s = preprocess(s)
            s = np.array([s, s, s, s])
            for i in range(100):
                # if epoch % 5 == 0:
                #     env.render()
                a = dqn.select_action(np.reshape(s, [1, -1]))
                s_prime, r, t, _ = env.step(np.argmax(a))
                frame = preprocess(s_prime)
                # Shift the frame stack and insert the new frame, max-pooled with
                # the previous one to reduce Atari sprite flicker
                s_prime = np.roll(s, 1, axis=0)
                s_prime[0] = np.maximum(frame, s_prime[1])
                memory.add(s.reshape([-1]), a, r - 1, s_prime.reshape([-1]), t)
                s = s_prime

                if t:
                    break

        #print(epoch, ": ", total_reward)

        # Train on that experience
        # for i in range(min((epoch + 1) * 5, 250)):
        for i in range(25):
            dqn.train(*memory.get_batch())

        dqn.reassign_target_weights()

        if (epoch + 1) % 25 == 0:
            s = env.reset()
            s = preprocess(s)
            s = np.array([s, s, s, s])
            for i in range(100):
                a = dqn.select_greedy_action(np.reshape(s, [1, -1]))
                env.render()
                s_prime, _, t, _ = env.step(np.argmax(a))
                s = np.roll(s, 1, axis=0)
                s[0] = preprocess(s_prime)
                if t:
                    break
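Example #8 calls preprocess() to turn raw Atari frames into the 84x84 single-channel images the network expects. A minimal sketch of such a function, assuming OpenCV is available; the original project's exact preprocessing is not shown:

import cv2
import numpy as np

def preprocess(frame):
    # Convert the RGB Atari frame to grayscale, downscale to 84x84,
    # and rescale pixel values to [0, 1]
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0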
Example #9
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        self.replay_mem = deque()
        self.last_scores = deque()
Example #10
    def __init__(self, restore=False):
        sess_conf = DQN.tf.ConfigProto()
        sess_conf.gpu_options.allow_growth = True
        self.sess = DQN.tf.Session(config=sess_conf)

        self.Q_main = DQN.DQN(self.sess, name="main")
        self.Q_target = DQN.DQN(self.sess, name="target")

        self.sess.run(DQN.tf.global_variables_initializer())
        self.copy_ops = DQN.get_copy_var_ops(dest_scope_name="target",
                                             src_scope_name="main")
        self.copy()

        if restore:
            self.restore()
            self.copy()
Example #11
def main():
    # For ease
    should_save = False
    should_load = True
    save_path = ".saves/doom_basic_take2_"
    load_path = ".saves/doom_basic_take2_399.ckpt"

    # Doom has a 480x640x3 observation space and a 43-dimensional multi-discrete action space.
    # We resize frames to a quarter of the original size: (120, 160).
    env = gym.make('ppaquette/DoomBasic-v0')
    height = 120
    width = 160
    channels = 4
    num_actions = NUM_ACTIONS

    dqn = DQN(DoomNetwork(height, width, channels),
              height * width,
              num_actions,
              num_stacked=channels)
    memory = MemoryReplay(height * width,
                          num_actions,
                          max_saved=10000,
                          num_stacked=channels)

    if should_load:
        dqn.load(load_path)

    for epoch in range(10000):

        # Gain experience
        total_reward = 0
        s = np.zeros([120, 160, 4])
        s_prime = np.zeros([120, 160, 4])
        s[:, :, 0] = process_image(env.reset())
        for i in range(10000):
            a = dqn.select_greedy_action(np.reshape(s, [1, -1]))
            action = DOOM_ACTIONS[a.reshape([-1]) == 1.0]
            s2, r, t, _ = env.step(action.reshape([-1]))
            total_reward += r
            s_prime[:, :, 1:] = s_prime[:, :, :3]
            s_prime[:, :, 0] = process_image(s2)
            memory.add(np.reshape(s, [-1]), a, r, np.reshape(s_prime, [-1]), t)
            env.render()
            s = s_prime.copy()  # copy so the next frame shift does not overwrite the stored state

            if t:
                break

        print(epoch, ": ", total_reward, ", ", dqn._epsilon)

        # Train on that experience
        # for i in range(min((epoch + 1) * 5, 250)):
        for i in range(100):
            dqn.train(*memory.get_batch())

        dqn.reassign_target_weights()

        if should_save and (epoch + 1) % 100 == 0:
            dqn.save(save_path + str(epoch) + ".ckpt")
Example #12
 def DQN(observation_shape, action_shape, **params):
     if params.get('noisy', False):
         net = dqn.NoisyDQN(observation_shape, action_shape)
     else:
         net = dqn.DQN(observation_shape, action_shape)
     if params.get('target', False):
         net = dqn.DQNT(net, params.get('double', False))
     return net.to(params.get('device', 'cpu'))
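A short usage sketch for the factory in Example #12; the keyword names (noisy, target, double, device) come from the snippet itself, while the shapes below are illustrative assumptions:

import torch

observation_shape = (4, 84, 84)   # illustrative only
action_shape = 6                  # illustrative only

# Plain DQN on the CPU
net_plain = DQN(observation_shape, action_shape)

# Noisy double-DQN wrapped with a target network, on GPU when available
net_noisy = DQN(observation_shape, action_shape,
                noisy=True, target=True, double=True,
                device='cuda' if torch.cuda.is_available() else 'cpu')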
Example #13
    def __init__(self, width, height, numTraining=0):

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = width  # Maze width
        self.params['height'] = height  # Maze height
        self.params['num_training'] = numTraining  # Number of games used for training

        # create saves and logs directory
        if not os.path.exists("saves/DQN/"):
            os.makedirs("saves/DQN/")
        if not os.path.exists("logs/"):
            os.makedirs("logs/")

        # get saves directory
        if params["load_file"] is not None and not params[
                "load_file"].startswith("saves/DQN/"):
            params["load_file"] = "saves/DQN/" + params["load_file"]

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)  # create DQN

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        self.Q_global = []  # max Q-values in the current game
        self.cost_disp = 0  # current loss

        self.cnt = self.qnet.sess.run(
            self.qnet.global_step
        )  # number of steps the model has been trained so far
        self.local_cnt = 0  # number of total steps the algorithm has run

        self.numeps = 0  # current episode
        if params["load_file"] is not None:
            self.numeps = int(params["load_file"].split("_")[-1])
        self.last_score = 0  # Score in the last step
        self.s = time.time()  # time elapsed since beginning of training
        self.last_reward = 0.  # Reward obtained in the last step

        self.replay_mem = deque()  # replay memory used for training

        self.terminal = False  # True if the game is in a terminal state

        self.last_score = 0  # Score obtained in the last state
        self.current_score = 0  # Score obtained in the current state
        self.last_reward = 0.  # Reward obtained in the last state
        self.ep_rew = 0  # Cumulative reward obtained in the current game

        self.last_state = None  # Last state
        self.current_state = None  # Current state
        self.last_action = None  # Last action
        self.won = True  # True if the game has been won
        self.delay = 0
        self.frame = 0
Example #14
    def __init__(self, learning_rate=1e-2, restore=False, name="main"):
        self.sess = DQN.tf.Session()
        self.action_value = DQN.DQN(self.sess,
                                    learning_rate=learning_rate,
                                    name=name)
        self.sess.run(DQN.tf.global_variables_initializer())

        if restore:
            self.restore()
Example #15
def main():
    max_episodes = 1000
    replay_buffer = deque()
    with tf.compat.v1.Session() as sess:
        mainDQN = DQN.DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN.DQN(sess, input_size, output_size, name="target")
        tf.compat.v1.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                state = next_state
                step_count += 1
                if step_count > 10000:
                    break
            print("Episode: {} steps: {}".format(episode, step_count))
            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss : ", loss)
                bot_play(mainDQN)
                sess.run(copy_ops)
        bot_play(mainDQN)
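Example #15 relies on two helpers that are not shown, get_copy_var_ops and replay_train. The sketches below follow the common pattern for this kind of main/target setup; they assume the DQN class exposes predict() returning a (1, output_size) array and update() running one training step, they reuse the module-level input_size and output_size globals the snippet already uses, and they introduce a DISCOUNT constant that is not in the original:

import numpy as np
import tensorflow as tf

DISCOUNT = 0.99  # assumed discount factor, not taken from the snippet


def get_copy_var_ops(dest_scope_name, src_scope_name):
    # Assign every trainable variable in the source scope ("main")
    # onto the matching variable in the destination scope ("target")
    op_holder = []
    src_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder


def replay_train(mainDQN, targetDQN, train_batch):
    # Build a regression target y = r + gamma * max_a' Q_target(s', a')
    # for each sampled transition and fit the main network to it
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + DISCOUNT * np.max(targetDQN.predict(next_state))
        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])
    return mainDQN.update(x_stack, y_stack)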
Example #16
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # Summary Writer
        self.summary = tf.Summary()
        self.wins = deque(maxlen=100)
        self.episodesSoFar = 0

        print(args)

        if (params['save_file']):
            self.writer = tf.summary.FileWriter('logs/model-' +
                                                params['save_file'],
                                                graph=self.sess.graph)

        self.replay_mem = None

        if (params['load_file']):
            try:
                with open('memories/model-' + params['load_file'], 'rb') as f:
                    self.replay_mem = pickle.load(f)
            except:
                pass

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        if not self.replay_mem:
            self.replay_mem = deque()
        self.last_scores = deque()
Example #17
def test_DQN_init():
    def dummy_cnn(x):
        return np.zeros((1, 64, 54, 30))
    dims = [54*30*64, 256, 512]
    n_actions = 10
    model = DQN(dims, n_actions, dummy_cnn)
    assert model.conv_net != None
    assert model.n_actions == 10
    assert isinstance(model.layers[0], nn.Linear)
    assert model.layers[0].in_features == 54*30*64
    assert model.layers[0].out_features == 256
Example #18
def test_DQN_forward():
    def dummy_cnn(x):
        return torch.zeros((1, 64, 54, 30))
    dims = [54*30*64, 256, 512]
    n_actions = 10
    model = DQN(dims, n_actions, dummy_cnn)
    assert model(None).shape == (1, 10)

# def run_test():
#     test_CNN()

# if __name__ == "__main__":
#     run_test()
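A minimal model sketch that would satisfy both tests above, assuming a convolutional feature extractor passed in as a callable plus a stack of fully connected layers built from dims; this is an illustration consistent with the assertions, not the project's actual DQN class:

import torch
import torch.nn as nn


class DQN(nn.Module):
    def __init__(self, dims, n_actions, conv_net):
        super().__init__()
        self.conv_net = conv_net      # callable producing (N, C, H, W) features
        self.n_actions = n_actions
        # dims = [flattened feature size, hidden_1, hidden_2, ...]
        self.layers = nn.ModuleList(
            nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:]))
        self.head = nn.Linear(dims[-1], n_actions)

    def forward(self, x):
        features = torch.as_tensor(self.conv_net(x))
        out = torch.flatten(features, start_dim=1)
        for layer in self.layers:
            out = torch.relu(layer(out))
        return self.head(out)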
Example #19
 def __init__(self, simulation_depth=6, number_of_simulation=1000, policy=None):
     super().__init__()
     '''
     Check Player class in runner.py to get information about useful predefined functions
     e.g. move, get_score, is_empty, step
     '''
     self.root = None
     self.alpha = 0.5
     self.simulation_depth = simulation_depth
     self.number_of_simulation = number_of_simulation
     self.g = Graph(format='png', graph_attr={}, node_attr={'fontsize': '13'})  # visualization graph
     self.node_id = 0
     self.policy = DQN.DQN(6)
     self.policy.load_state_dict(torch.load('checkpoint_left.pth', map_location=device))
Example #20
 def __init__(self,
              gamma,
              epsilon_start,
              epsilon_end,
              epsilon_decay,
              alpha,
              target_update,
              max_iter,
              tau,
              batch_size=16,
              dropout_ratio=0.25):
     self.gamma = gamma
     self.epsilon_start = epsilon_start
     self.epsilon_end = epsilon_end
     self.epsilon_decay = epsilon_decay
     self.alpha = alpha
     self.target_update = target_update
     self.max_iter = max_iter
     self.batch_size = batch_size
     self.dropout_ratio = dropout_ratio
     self.tau = tau
     self.tag = "g" + str(self.gamma) + "e" + str(self.epsilon_decay) + "lr" + str(self.alpha) + "t" \
                + str(self.target_update) + "b" + str(self.batch_size) + "d" + str(self.dropout_ratio) + "tau" + str(self.tau)
     self.memory = ReplayMemory(5000, self.batch_size)
     self.env = gym.make("LunarLander-v2")
     self.n_actions = self.env.action_space.n
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
     self.policy_net = DQN(self.dropout_ratio)
     self.target_net = DQN(self.dropout_ratio)
     self.policy_net = self.policy_net.float()
     self.target_net = self.target_net.float()
     self.target_net.load_state_dict(self.policy_net.state_dict())
     self.optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                       lr=self.alpha)
     self.loss = MSELoss()
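Example #20 stores tau but never shows the target update. A hypothetical sketch of the Polyak (soft) update that such a field usually drives, written as an extra method against the two networks created above (assumes torch is imported as in the snippet):

 def soft_update_target(self):
     # target <- tau * policy + (1 - tau) * target, parameter by parameter
     with torch.no_grad():
         for target_param, policy_param in zip(self.target_net.parameters(),
                                               self.policy_net.parameters()):
             target_param.mul_(1.0 - self.tau)
             target_param.add_(self.tau * policy_param)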
Example #21
    def __init__(self):
        self.graph = Graph()
        self.actionSpace = []
        self.requests = []
        self.max_request = 20000

        # get the action space
        for cl in self.graph.cloudlets:
            for operate in [0, 1, 2]:
                action = (cl, operate)
                self.actionSpace.append(action)
        self.action_size = len(self.actionSpace)

        # get the feature size (dimension of the state vector)
        self.feature_size = 5 + self.graph.cloudlet_number * self.graph.web_function_number * 2

        # build the learner only after action_size and feature_size are known
        self.RL = DQN(self.action_size, self.feature_size, output_graph=True)
Example #22
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        # self.replay_mem = deque()
        # use a proportional prioritized replay buffer instead of a plain deque
        self.last_priority = []
        self.cur_priority = 1
        self.last_priority.append(self.cur_priority)
        self.delta = 0
        self.explore = True
        self.replay_mem = proportional.Experience(self.params['mem_size'],
                                                  self.params['batch_size'],
                                                  self.params['alpha'])
        self.last_scores = deque()
Example #23
    def __init__(self, player, args):
        self.player = player
        print("Initialise DQN Agent")

        self.params = params
        self.params['num_training'] = args['num_training']

        self.params.update(args)
        self.params['width'] = self.params['layout'].width
        self.params['height'] = self.params['layout'].height
        self.save_file = self.params['save_file'] + '-' + str(player.index)
        self.load_file = self.params['load_file'] + '-' + str(player.index)
        # self.save_file = self.params['save_file']
        # self.load_file = self.params['load_file']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        dqnParams = self.params.copy()
        dqnParams['load_file'] = self.load_file
        self.qnet = DQN(dqnParams)

        # time started
        self.general_record_time = time.strftime(
            "%a_%d_%b_%Y_%H_%M_%S", time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        self.replay_mem = deque()
        self.last_scores = deque()
Example #24
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN.DQN(env)
    SIZE = env.SIZE

    agent.copyWeightsToTarget()
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph('model/model-7500.meta')
        new_saver.restore(sess, tf.train.latest_checkpoint('model/'))
    total_reward = 0
    for i in range(TEST):
        state = env.reset()
        state = np.reshape(state, [-1])
        camp = -1
        state = np.append(state, camp)

        for j in range(STEP):
            env.render()
            action = agent.action(state)  # direct action for test

            action = [math.floor(action / SIZE), action % SIZE, camp]
            state, reward, done, _ = env.step(action)
            state = np.reshape(state, [-1])
            if j % 2 == 0:
                camp = 1
            else:
                camp = -1
            state = np.append(state, camp)

            total_reward += reward
            time.sleep(0.5)
            if done:
                env.render()
                print('done')
                time.sleep(3)
                break
    ave_reward = total_reward / TEST
    print('Evaluation Average Reward over', TEST, 'episodes:', ave_reward)
Example #25
def main():
    """
        Main script.
        Default environment: CartPole-v0
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--num_episodes', type=int, default=100)
    parser.add_argument('--epsilon', type=float, default=0.1)
    parser.add_argument('--min_epsilon', type=float, default=0.01)
    parser.add_argument('--epsilon_decay', type=float, default=0.995)
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    with tf.Session() as sess:
        agent = DQN(args, sess)
        print("Training agent...\n")
        agent.train()
        print("Training completed successfully.\nPrinting Results.\n")
        agent.print_results()
Example #26
    def __init__(self, index):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        params['load_file'] = ('/projectnb/dl-course/nidhi/project/taken_swarnim/'
                               'DL_RL_CollisionAvoidance/PacmanDQN/saves/' +
                               params['ghosts_models'][index - 1])
        self.params = params
        self.params['width'] = 20
        self.params['height'] = 11
        self.params['num_training'] = 400
        # TODO: make this dynamic - for different ghosts
        self.index = index
        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.
        self.lastdist = 15

        self.replay_mem = deque()
        self.last_scores = deque()
Example #27
def main():
    env = gym.make("CartPole-v0")
    # CartPole has a 4-dimensional observation space and a 2-dimensional action space
    dqn = DQN(CartPoleNetwork(), 4, 2)
    memory = MemoryReplay(4, 2, max_saved=10000)
    steps = 0
    for epoch in range(1000):

        # Gain experience
        s = env.reset()
        for i in range(200):
            a = dqn.select_action(np.reshape(s, [1, -1]))
            s_prime, r, t, _ = env.step(np.argmax(a))
            memory.add(s, a, -1 if t else 0, s_prime, t)  # reward: -1 on termination, 0 otherwise
            s = s_prime

            # Train on that experience
            dqn.train(*memory.get_batch())
            steps += 1
            if steps % 25 == 0:
                dqn.reassign_target_weights()

            if t:
                break

        s = env.reset()
        greedy_success = 0
        for i in range(200):
            a = dqn.select_greedy_action(np.reshape(s, [1, -1]))
            #env.render()
            s, _, t, _ = env.step(np.argmax(a))
            greedy_success += 1
            if t:
                break

        print(epoch, ", ", greedy_success)
Example #28
def main(game):

    # Hyper Parameters
    epsilon = 1
    epsilon_l_bound = 0.01
    epsilon_steps = (epsilon - epsilon_l_bound) / 100000
    epochs = 10000
    game_over = False
    env = gym.make(game)
    discount_factor = 0.99
    batch_size = 32
    grid_size = 84
    num_state_frames = 4
    replay_memory_size = 100000
    target_update_steps = 5000

    # Reset once to get a sample frame (different games have different image sizes)
    # and query the number of available actions
    sample_state = env.reset()
    num_act = env.action_space.n

    sess = tf.Session()

    # will use this object when processing images
    pre_proc = pre_processor.pre_processor(sample_state, grid_size)

    # separate scopes so that we can separately update weights
    major_scope_name = 'Major'
    target_scope_name = 'Target'
    major = DQN.DQN(sess, major_scope_name, num_act, grid_size,
                    num_state_frames)
    target = DQN.DQN(sess, target_scope_name, num_act, grid_size,
                     num_state_frames)

    init = tf.global_variables_initializer()
    sess.run(init)

    # Both the major and target networks are randomly initialized, so make
    # their weights identical before training starts
    update_target(major_scope_name, target_scope_name, sess)

    cur_epoch = 0
    scores = []
    replay_memory = []

    # current is used to fill in data into replay memory
    current = 0

    for i in range(epochs):

        step = 0
        cur_epoch_reward = 0

        # A single frame carries no motion information, so the network input is a
        # stack of the last num_state_frames frames
        current_state = np.zeros([1, grid_size, grid_size, num_state_frames])

        # st is a single raw frame; the stacked frames in current_state are what goes into the network
        st = env.reset()
        processed_st = pre_proc.process(sess, st)
        game_over = False

        # Until enough frames have been observed, populate the whole stack with
        # copies of the first processed frame
        for f in range(num_state_frames):
            current_state[:, :, :, f] = processed_st

        while not game_over:

            # epsilon greedy policy
            if epsilon > np.random.rand(1):
                act = np.random.randint(num_act)
            else:
                act = np.argmax(major.get_q_val(current_state))

            next_frame, reward, game_over, info = env.step(act)

            cur_epoch_reward += reward
            next_state = np.zeros([1, grid_size, grid_size, num_state_frames])

            # Insert newly gained image state into next_state
            # Shift the frame stack by one and append the newly observed frame
            next_state[:, :, :, :num_state_frames - 1] = current_state[:, :, :, 1:]
            next_state[:, :, :, num_state_frames - 1] = pre_proc.process(sess, next_frame)

            if current < replay_memory_size:
                replay_memory.append(
                    [current_state[0], act, reward, next_state[0], game_over])
            else:
                replay_memory[current % replay_memory_size] = [
                    current_state[0], act, reward, next_state[0], game_over
                ]

            current += 1

            # Randomly sample batch from replay memory to train networks
            batch = random.sample(replay_memory, min(batch_size, current))

            state_batch, act_batch, reward_batch, next_state_batch, game_over_batch = map(
                np.array, zip(*batch))

            q_next_val = target.get_q_val(next_state_batch)
            target_loss_args = reward_batch + (
                1 - game_over_batch) * discount_factor * np.amax(q_next_val,
                                                                 axis=1)

            # Train major nn
            major.update(state_batch, target_loss_args, act_batch)

            # decrease epsilon
            if epsilon >= epsilon_l_bound:
                epsilon -= epsilon_steps

            # update the target network after a specific number of steps has passed
            if step % target_update_steps == 0:
                update_target(major_scope_name, target_scope_name, sess)

            current_state = next_state
            step += 1

        cur_epoch += 1

        scores.append(cur_epoch_reward)
        print("epoch : {}, step : {}, score : {}, avg_score : {}".format(
            cur_epoch, step, cur_epoch_reward, np.average([scores])))
Example #29
    for (x, y) in valid:
        t = (int((y + 1.5) * BOX), int((x + 1.5) * BOX))
        pygame.draw.circle(surf, (0, 255, 0), t, int(BOX / 3))

    if isinstance(last_pos, tuple):
        (x, y) = last_pos
        t = (int((y + 1.5) * BOX), int((x + 1.5) * BOX))
        pygame.draw.circle(surf, (255, 0, 0), t, int(BOX / 8))

    pygame.display.flip()


if __name__ == "__main__":
    me_first = -1  # 1 means the AI moves first; -1 means the human moves first
    ai = DQN(me_first)

    game = Game()
    running = True

    if me_first == 1:
        step = 1
        ai_color = 1
        human_color = -1  # if the human moves second, the human plays white
    else:
        step = 0
        ai_color = -1
        human_color = 1   # if the human moves first, the human plays black

    grid = 0
    while running:
Example #30
def main(args):

    json_file = args.json_file
    json_files_train = args.json_files_train

    json_file_policy_train = args.json_file_PA_train
    json_file_policy_CS_train = args.json_file_CS_train

    with open('./config/deployment/' + json_file + '.json', 'r') as f:
        options = json.load(f)
    with open('./config/policy/' + json_file_policy_train + '.json', 'r') as f:
        options_policy = json.load(f)
    with open('./config/policy/' + json_file_policy_CS_train + '.json',
              'r') as f:
        options_CS = json.load(f)
    if not options_policy['cuda']:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf
    for json_file_train in json_files_train:
        with open('./config/deployment/' + json_file_train + '.json',
                  'r') as f:
            options_train = json.load(f)
        included_train_episodes = []
        tot_train_episodes = int(options_train['simulation']['total_samples'] /
                                 options_train['train_episodes']['T_train'])
        N = options['simulation']['N']
        # Multi channel scenario, M denotes number of channels.
        if 'M' in options['simulation']:
            M = options['simulation']['M']
        else:
            M = 1
        # if N <=20:
        #     for i in range(tot_train_episodes+1):
        #         if i<=15 or i%5==0:
        #             included_train_episodes.append(i)
        # else:
        included_train_episodes.append(tot_train_episodes)

        train_tot_simulations = options_train['simulation']['num_simulations']
        tot_test_episodes = int(options['simulation']['total_samples'] /
                                options['train_episodes']['T_train'])
        inner_train_networks = [[]] * tot_test_episodes
        for i in range(tot_test_episodes):
            inner_train_networks[i] = 0
            # if options['simulation']['test_include'] == 'all':
            #     inner_train_networks[i] = 0#list(range(train_tot_simulations))
            # else:
            #     inner_train_networks[i] = list(np.random.randint(0,train_tot_simulations,options['simulation']['test_include']))
        ## Number of samples
        total_samples = options['simulation']['total_samples']

        N = options['simulation']['N']

        # simulation parameters
        train_episodes = options['train_episodes']
        mobility_params = options['mobility_params']
        mobility_params['alpha_angle'] = options['mobility_params'][
            'alpha_angle_rad'] * np.pi  #radian/sec
        #Some defaults
        Pmax_dB = 38.0 - 30
        Pmax = np.power(10.0, Pmax_dB / 10)
        n0_dB = -114.0 - 30
        noise_var = np.power(10.0, n0_dB / 10)

        for ep in included_train_episodes:
            #
            np.random.seed(500 + N + ep)
            file_path = './simulations/channel/%s_network%d' % (json_file, 0)
            data = np.load(file_path + '.npz')

            H_all = data['arr_1']
            H_all_2 = []
            for i in range(total_samples):
                H_all_2.append(H_all[i]**2)

            weights = []
            for loop in range(total_samples):
                weights.append(np.array(np.ones(N)))

            time_calculating_strategy_takes = []

            # Virtual neighbor placer

            policy = DQN.DQN(options,
                             options_policy,
                             N,
                             M,
                             Pmax,
                             noise_var,
                             seed=500 + N + ep)

            ## Our JSAC version uses a linear quantizer.
            strategy_translation = np.zeros(policy.power_levels)
            strategy_translation[0] = 0.0  # Tx power 0
            # Calculate steps in dBm
            for i in range(1, policy.power_levels - 1):
                strategy_translation[i] = i * (Pmax /
                                               (policy.power_levels - 1))
            strategy_translation[-1] = Pmax

            # strategy_translation = np.zeros(policy.power_levels)
            # strategy_translation[0] = 0.0 # Tx power 0
            # Pmin_dB = 10.0-30
            # # Calculate steps in dBm
            # strategy_translation_dB_step = (Pmax_dB-Pmin_dB)/(policy.power_levels-2)
            # for i in range(1,policy.power_levels-1):
            #     strategy_translation[i] = np.power(10.0,((Pmin_dB+(i-1)*strategy_translation_dB_step))/10)
            # strategy_translation[-1] = Pmax

            time_calculating_strategy_takes = []
            time_optimization_at_each_slot_takes = []
            sum_rate_distributed_policy_episode = []
            p_strategy_all_apisode = []
            i_train = 0
            #        for i_train in range(len(inner_train_networks[0])):
            sum_rate_distributed_policy = []
            sum_rate_list_distributed_policy = collections.deque([], 2)
            # Initial allocation is just random
            p_central = Pmax * np.random.rand(N)
            p_strategy = np.array(
                p_central)  # strategy is a completely different object
            p_strategy_current = np.array(p_strategy)

            alpha_central = np.zeros((N, M))
            for k in range(N):
                alpha_central[k, np.random.randint(M)] = 1
            alpha_strategy = np.array(
                alpha_central)  # strategy is a completely different object
            alpha_strategy_current = np.array(alpha_strategy)

            alpha_int_central = np.where(alpha_central == 1)[1].astype(int)
            alpha_int_strategy = np.array(
                alpha_central)  # strategy is a completely different object
            alpha_int_strategy_current = np.array(alpha_int_strategy)

            # current CSI used to calculate the power allocation
            current_csi = 0
            previous_csi = 0

            p_strategy_all = []
            alpha_strategy_all = []
            alpha_int_strategy_all = []

            with tf.Session() as sess:
                sess.run(policy.init)
                policy.initialize_updates(sess)
                # Start iterating over time slots
                for sim in range(total_samples):
                    # save an instance per training episode for testing purposes.
                    if (sim % train_episodes['T_train'] == 0):
                        train_network_idx = i_train  #inner_train_networks[int(sim /train_episodes['T_train'])][i_train]
                        model_destination = (
                            './simulations/sumrate/policy/%s_%s_%s_network%d_episode%d.ckpt'
                            % (json_file_train, json_file_policy_train,
                               json_file_policy_CS_train, train_network_idx,
                               ep)).replace('[', '').replace(']', '')
                        policy.load(sess, model_destination)
                        i_train += 1
                        i_train = i_train % train_tot_simulations

                    # If at least one time slot passed to get experience
                    if (sim % train_episodes['T_train'] > 1):
                        # Each agent picks its strategy.
                        for agent in range(N):
                            current_local_state = policy.local_state(
                                sim, agent, p_strategy_all, alpha_strategy_all,
                                H_all_2, sum_rate_list_distributed_policy,
                                weights)
                            a_time = time.time()
                            strategy = policy.act_noepsilon(
                                sess, current_local_state, sim)
                            time_calculating_strategy_takes.append(
                                time.time() - a_time)

                            # Pick the action
                            p_strategy[agent] = strategy_translation[
                                strategy % policy.power_levels]
                            alpha_strategy[agent, :] = np.zeros(M)
                            alpha_strategy[agent,
                                           strategy // policy.power_levels] = 1
                            alpha_int_strategy[
                                agent] = strategy // policy.power_levels

                            # Add current state to the short term memory to observe it during the next state
                            policy.previous_state[
                                agent, :] = current_local_state
                            policy.previous_action[agent] = strategy

                    if (sim % train_episodes['T_train'] < 2):
                        p_strategy = np.random.rand(N)
                        alpha_strategy = np.zeros((N, M))
                        for k in range(N):
                            alpha_strategy[k, np.random.randint(M)] = 1
                        alpha_int_strategy = np.where(
                            alpha_strategy == 1)[1].astype(int)

                    p_strategy_current = np.array(p_strategy)
                    alpha_strategy_current = np.array(alpha_strategy)
                    alpha_int_strategy_current = np.array(
                        alpha_int_strategy).astype(int)
                    for m in range(M):
                        policy.prev_suminterferences[:, m] = np.matmul(
                            H_all_2[sim][:, :, m], alpha_strategy[:, m] *
                            p_strategy) - (H_all_2[sim][:, :, m].diagonal() *
                                           alpha_strategy[:, m] *
                                           p_strategy) + noise_var
                    # sims_pos_p[np.where(p_strategy_current>0)] = sim

                    sum_rate_list_distributed_policy.append(
                        pb.reward_helper(H_all[sim], p_strategy,
                                         alpha_strategy, noise_var, Pmax))

                    weights.append(np.array(np.ones(N)))
                    sum_rate_distributed_policy.append(
                        pb.sumrate_multi_weighted_clipped(
                            H_all[sim], p_strategy, alpha_strategy, noise_var,
                            weights[sim]))

                    p_strategy_all.append(p_strategy_current)
                    alpha_strategy_all.append(alpha_strategy_current)
                    alpha_int_strategy_all.append(alpha_int_strategy_current)
                    if (sim % 2500 == 0):
                        print('Test time %d' % (sim))
            sum_rate_distributed_policy_episode.append(
                copy.copy(sum_rate_distributed_policy))
            p_strategy_all_apisode.append(copy.copy(p_strategy_all))

            # End Train Phase
            np_save_path = './simulations/sumrate/test/%s_%s_%s_%s_episode%d.ckpt' % (
                json_file, json_file_train, json_file_policy_train,
                json_file_policy_CS_train, ep)
            print(np_save_path)
            np.savez(np_save_path, options, options_policy,
                     sum_rate_distributed_policy_episode,
                     p_strategy_all_apisode,
                     time_optimization_at_each_slot_takes,
                     time_calculating_strategy_takes, included_train_episodes,
                     inner_train_networks)
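The strategy decoding near the end of Example #30 packs a (channel, power level) pair into a single integer via strategy = channel * power_levels + power index. A standalone illustration of that encoding with made-up sizes:

power_levels = 4   # illustrative, not read from the config files
M = 3              # illustrative number of channels

for channel in range(M):
    for p_idx in range(power_levels):
        strategy = channel * power_levels + p_idx
        # Decode exactly as the example does
        assert strategy % power_levels == p_idx
        assert strategy // power_levels == channel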