Example #1
def run_random(f_result, p_matrix=P_DISTINCT_MATRIX, fileName='log_random'):
    # step 1: initialize the environment and the random policy
    env = Environment(p_matrix)
    brain = RandomPolciy()

    action = np.zeros(int(ACTION_SIZE))
    action[0] = 1
    action_env = _process(action)

    observation, reward, terminal = env.step(action_env)

    count = 0
    total = 0
    total += reward

    start_time = time.time()

    f = open(fileName, 'w')
    for i in range(T_THRESHOLD):

        count = i + 1

        action = brain.getAction()
        action_env = _process(action)

        observation, reward, terminal = env.step(action_env)

        total += reward

        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write(
                'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                % (count, accum_reward, str(action_env), duration))
            f.write('\n')

    f.close()

    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write('Random final accu_reward is %f and time duration is %f\n' %
                   (accum_reward, duration))
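Example #1 relies on two helpers that are not shown on this page. The sketch below is only a guess reconstructed from how they are used above: the class name RandomPolciy is kept exactly as written (sic), and ACTION_SIZE, the one-hot layout, and the argmax mapping in _process are assumptions, not the project's actual definitions.

import random
import numpy as np

ACTION_SIZE = 8  # placeholder; the real constant comes from the project's config

class RandomPolciy(object):  # class name kept exactly as it appears above
    """Hypothetical random baseline: returns a uniformly random one-hot action."""
    def getAction(self):
        action = np.zeros(int(ACTION_SIZE))
        action[random.randrange(int(ACTION_SIZE))] = 1
        return action

def _process(action):
    """One plausible mapping from a one-hot action vector to what env.step() expects."""
    return int(np.argmax(action))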
Example #2
def run_random(f_result, p_matrix=P_DISTINCT_MATRIX, fileName='log_random'):
    # step 1: initialize the environment and the random policy
    env = Environment(p_matrix)
    brain = RandomPolciy()

    action = np.zeros(int(ACTION_SIZE))
    action[0] = 1
    action_env = _process(action)

    observation, reward, terminal = env.step(action_env)

    count = 0
    total = 0
    total += reward

    start_time = time.time()

    f = open(fileName, 'w')
    for i in range(T_THRESHOLD):

        count = i + 1

        action = brain.getAction()
        action_env = _process(action)

        observation, reward, terminal = env.step(action_env)

        total += reward

        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action_env), duration))
            f.write('\n')

    f.close()

    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write('Random final accu_reward is %f and time duration is %f\n' % (accum_reward, duration))
Example #3
def run_whittleIndex(f_result,
                     p_matrix=P_DISTINCT_MATRIX,
                     fileName='log_whittle'):

    #fileName = fileName

    # f = open(fileName, 'w')
    #
    start_time = time.time()
    accum_reward = 0

    for j in range(T_TIMES):

        total = 0

        gamma = GAMMA

        env = Environment(p_matrix)

        print(p_matrix)

        brain = WhittleIndex(B, gamma, p_matrix)

        action = [i for i in range(N_SENSING)]

        for i in range(T_EVAL):

            observation, reward, terminal = env.step(action)
            total += reward * (gamma**i)

            action = brain.getAction(action, observation, 0)

            count = i + 1

            # if count % PERIOD == 0:
            #     accum_reward = total
            #     duration = time.time() - start_time
            #     f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
            #         count, accum_reward, str(action), duration))
            #     f.write('\n')

        # f.close()

        count = i + 1
        accum_reward += total
        duration = time.time() - start_time
        f_result.write(
            'Whittle Index final accu_reward is %f and time duration is %f\n' %
            (total, duration))

    return accum_reward
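The quantity accumulated in the inner loop above is the discounted return, the sum over i of gamma**i * reward_i, and run_whittleIndex() returns the sum of that value over T_TIMES independent runs. A tiny self-contained check of the accumulation rule (the rewards below are made up):

GAMMA = 0.9
rewards = [1.0, 0.0, 1.0, 1.0]  # made-up per-step rewards

total = 0.0
for i, r in enumerate(rewards):
    total += r * (GAMMA ** i)  # same update as total += reward * (gamma ** i) above

print(total)  # 1.0 + 0.0 + 0.81 + 0.729 = 2.539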
Example #4
def run_whittleIndex(f_result, p_matrix=P_DISTINCT_MATRIX, fileName='log_whittle'):

    #fileName = fileName

    # f = open(fileName, 'w')
    #
    start_time = time.time()
    accum_reward = 0

    for j in range(T_TIMES):

        total = 0

        gamma = GAMMA

        env = Environment(p_matrix)

        print(p_matrix)

        brain = WhittleIndex(B, gamma, p_matrix)

        action = [i for i in range(N_SENSING)]



        for i in range(T_EVAL):

            observation, reward, terminal = env.step(action)
            total += reward*(gamma**i)

            action = brain.getAction(action, observation, 0)

            count = i + 1

            # if count % PERIOD == 0:
            #     accum_reward = total
            #     duration = time.time() - start_time
            #     f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
            #         count, accum_reward, str(action), duration))
            #     f.write('\n')

        # f.close()

        count = i + 1
        accum_reward += total
        duration = time.time() - start_time
        f_result.write('Whittle Index final accu_reward is %f and time duration is %f\n' % (total, duration))

    return accum_reward
Example #5
    def target_network_eval(self, fileName, p_matrix, f_result):
        env = Environment(p_matrix)
        action = np.zeros(int(ACTION_SIZE))
        action[0] = 1
        action_env = self.process(action)

        observation, reward, terminal = env.step(action_env)

        currentState = self.setInitState(observation)

        count = 0
        total = 0

        total += reward

        epsilon = INITIAL_EPSILON

        print("Start target evaluation")

        start_time = time.time()
        f = open(fileName, 'w')

        while count < T_EVAL:
            count += 1

            if count <= 10:
                action = np.zeros(int(ACTION_SIZE))
                action_index = random.randrange(ACTION_SIZE)
                action[int(action_index)] = 1

            else:

                action = self.get_target_action(currentState, epsilon, count)

            action_env = self.process(action)

            if count > 15 and count < 50:
                print('observation')
                print(observation)

                print('action')
                print(action_env)

            observation, reward, terminal = env.step(action_env)

            total += reward

            nextState = np.concatenate(
                (currentState[CHANNEL_SIZE:], observation))

            currentState = nextState

            if (count + 1) % PERIOD == 0:
                duration = time.time() - start_time
                accum_reward = total / float(count + 1)

                f.write(
                    'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                    % (count + 1, accum_reward, str(action_env), duration))
                f.write('\n')

        f.close()

        duration = time.time() - start_time
        accum_reward = total / float(count)
        f_result.write(
            'Async Q-learning using target final accu_reward is %f, and time duration is %f\n'
            % (accum_reward, duration))

        print("target evaluation ends")
Example #6
    def q_learner_thread(self, thread_id, fileName, p_matrix, f_result):

        env = Environment(p_matrix)
        action = np.zeros(int(ACTION_SIZE))
        action[0] = 1
        action_env = self.process(action)

        observation, reward, terminal = env.step(action_env)

        currentState = self.setInitState(observation)

        state_batch = []
        y_batch = []
        action_batch = []
        nextState_batch = []
        reward_batch = []
        terminal_batch = []
        count = 0
        total = 0

        total += reward

        epsilon = INITIAL_EPSILON
        final_epsilon = self.sample_final_epsilon()

        print("Start thread %d" % thread_id)
        time.sleep(3 * thread_id)

        start_time = time.time()
        f = open(fileName, 'w')

        while count < T_THRESHOLD:
            count += 1
            state_batch.append(currentState)
            action = self.getAction(currentState, epsilon)
            action_batch.append(action)
            action_env = self.process(action)

            observation, reward, terminal = env.step(action_env)

            total += reward

            reward_batch.append(reward)
            terminal_batch.append(terminal)

            nextState = np.concatenate(
                (currentState[CHANNEL_SIZE:], observation))
            nextState_batch.append(nextState)
            currentState = nextState

            if count % ASYNC_UPDATE_INTERVAL == 0:

                q_values_batch = self.q_values_T.eval(
                    session=self.session,
                    feed_dict={self.state_placeholder_T: nextState_batch})

                for i in range(ASYNC_UPDATE_INTERVAL):

                    if terminal_batch[i]:
                        y_batch.append(reward_batch[i])
                    else:
                        y_batch.append(reward_batch[i] +
                                       GAMMA * np.max(q_values_batch[i]))

                self.train_op.run(session=self.session,
                                  feed_dict={
                                      self.state_placeholder: state_batch,
                                      self.action_placeholder: action_batch,
                                      self.y_placeholder: y_batch
                                  })
                state_batch = []
                y_batch = []
                action_batch = []
                nextState_batch = []
                reward_batch = []
                terminal_batch = []

            if count % TARGET_UPDATE_INTERVAL == 0:
                self.session.run(self.updateTargetNetwork)

            # anneal epsilon toward final_epsilon
            if epsilon > final_epsilon and count > OBSERVE:
                epsilon -= (INITIAL_EPSILON - final_epsilon) / EXPLORE

            if (count + 1) % PERIOD == 0:
                duration = time.time() - start_time
                accum_reward = total / float(count + 1)

                f.write(
                    'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                    % (count + 1, accum_reward, str(action_env), duration))
                f.write('\n')

        f.close()

        duration = time.time() - start_time
        accum_reward = total / float(count)
        f_result.write(
            'Async Q-learning of thread %d final accu_reward is %f, and time duration is %f\n'
            % (thread_id, accum_reward, duration))

        print("thread %d end" % thread_id)
Example #7
    def q_learner_thread(self, thread_id, fileName, p_matrix, f_result):

        env = Environment(p_matrix)
        action = np.zeros(int(ACTION_SIZE))
        action[0] = 1
        action_env = self.process(action)

        observation, reward, terminal = env.step(action_env)

        currentState = self.setInitState(observation)

        state_batch = []
        y_batch = []
        action_batch = []
        nextState_batch = []
        reward_batch = []
        terminal_batch = []
        count = 0
        total = 0

        total += reward

        epsilon = INITIAL_EPSILON
        final_epsilon = self.sample_final_epsilon()

        print("Start thread %d" % thread_id)
        time.sleep(3 * thread_id)

        start_time = time.time()
        f = open(fileName, 'w')

        while count < T_THRESHOLD:
            count += 1
            state_batch.append(currentState)
            action = self.getAction(currentState, epsilon)
            action_batch.append(action)
            action_env = self.process(action)

            observation, reward, terminal = env.step(action_env)

            total += reward

            reward_batch.append(reward)
            terminal_batch.append(terminal)

            nextState = np.concatenate((currentState[CHANNEL_SIZE:], observation))
            nextState_batch.append(nextState)
            currentState = nextState

            if count % ASYNC_UPDATE_INTERVAL == 0:

                q_values_batch = self.q_values_T.eval(session=self.session,
                                                      feed_dict={self.state_placeholder_T: nextState_batch})

                for i in range(ASYNC_UPDATE_INTERVAL):

                    if terminal_batch[i]:
                        y_batch.append(reward_batch[i])
                    else:
                        y_batch.append(reward_batch[i] + GAMMA * np.max(q_values_batch[i]))

                self.train_op.run(session=self.session,
                                  feed_dict={self.state_placeholder: state_batch, self.action_placeholder: action_batch,
                                             self.y_placeholder: y_batch})
                state_batch = []
                y_batch = []
                action_batch = []
                nextState_batch = []
                reward_batch = []
                terminal_batch = []

            if count % TARGET_UPDATE_INTERVAL == 0:
                self.session.run(self.updateTargetNetwork)

            # anneal epsilon toward final_epsilon
            if epsilon > final_epsilon and count > OBSERVE:
                epsilon -= (INITIAL_EPSILON - final_epsilon) / EXPLORE

            if (count + 1) % PERIOD == 0:
                duration = time.time() - start_time
                accum_reward = total / float(count + 1)

                f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                    count + 1, accum_reward, str(action_env), duration))
                f.write('\n')

        f.close()

        duration = time.time() - start_time
        accum_reward = total / float(count)
        f_result.write('Async Q-learning of thread %d final accu_reward is %f, and time duration is %f\n' % (
            thread_id, accum_reward, duration))

        print("thread %d end" % thread_id)
Example #8
    def target_network_eval(self, fileName, p_matrix, f_result):
        env = Environment(p_matrix)
        action = np.zeros(int(ACTION_SIZE))
        action[0] = 1
        action_env = self.process(action)

        observation, reward, terminal = env.step(action_env)

        currentState = self.setInitState(observation)

        count = 0
        total = 0

        total += reward

        epsilon = INITIAL_EPSILON

        print("Start target evaluation")

        start_time = time.time()
        f = open(fileName, 'w')

        while count < T_EVAL:
            count += 1

            if count <= 10:
                action = np.zeros(int(ACTION_SIZE))
                action_index = random.randrange(ACTION_SIZE)
                action[int(action_index)] = 1

            else:

                action = self.get_target_action(currentState, epsilon, count)

            action_env = self.process(action)

            if count > 15 and count < 50:
                print('observation')
                print(observation)

                print('action')
                print(action_env)

            observation, reward, terminal = env.step(action_env)

            total += reward

            nextState = np.concatenate((currentState[CHANNEL_SIZE:], observation))

            currentState = nextState

            if (count + 1) % PERIOD == 0:
                duration = time.time() - start_time
                accum_reward = total / float(count + 1)

                f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                    count + 1, accum_reward, str(action_env), duration))
                f.write('\n')

        f.close()

        duration = time.time() - start_time
        accum_reward = total / float(count)
        f_result.write(
            'Async Q-learning using target final accu_reward is %f, and time duration is %f\n' % (accum_reward, duration))

        print("target evaluation ends")
Example #9
def run_test(f_result, p_matrix, fileName='log_q_table_suff', history=AGENT_STATE_WINDOWS_SIZE):
    # all_states_list = tuple(product(range(-1, 2), repeat=N_CHANNELS))
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))
    # all_observations_list = tuple(product(all_states_list,repeat=AGENT_STATE_WINDOWS_SIZE))

    # p_matrix = [[(0.6, 0.4), (0.2, 0.8)]] * N_CHANNELS

    env = Environment(p_matrix)

    init_state = set_init_state(p_matrix)

    q_agent = QAgent(state_transition_function, init_state, all_actions_list)

    total = 0

    action_evn = [i for i in range(N_SENSING)]  # initial action

    observation, reward, terminal = env.step(action_evn)
    total += reward

    f = open(fileName, 'w')

    start_time = time.time()

    prev_value_dict = {}
    count_cvg = 0

    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action, prev_value_dict, count_cvg = q_agent.observe_and_act(observation, reward, count, p_matrix,
                                                                     prev_value_dict, count_cvg)

        if count_cvg == T_CVG:
            print('policy converged at training round %d' % i)

            break

        action_evn = list(action)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    f_result.write('count_cvg is %d' % count_cvg)
    f_result.write('\n')
    f_result.write(str(prev_value_dict))
    f_result.write('\n')

    # evaluation


    total = 0

    fileName = fileName + '_target'
    f = open(fileName, 'w')

    start_time = time.time()

    for i in range(T_EVAL):
        count = i + 1

        if type(observation).__module__ == 'numpy':
            observation = tuple(observation.tolist())
        else:
            print('training finished earlier')

        action = q_agent.target_observe_and_act(observation, reward, count, p_matrix)

        action_evn = list(action)

        # if count <= 50:
        #
        #   print('observation')
        #   print(observation)
        #
        #   print('action')
        #   print(action_evn)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % 10 == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write('Q table_suff final accu_reward is %f and time duration is %f\n' % (accum_reward, duration))
Example #10
def run_test(f_result, p_matrix=P_MATRIX, fileName='log_q_table', history=AGENT_STATE_WINDOWS_SIZE):
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))

    env = Environment(p_matrix)

    init_state = tuple([tuple([-1 for i in xrange(N_CHANNELS)]) for j in
                        xrange(history)])
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)

    total = 0

    action_evn = [i for i in range(N_SENSING)]  # initial action

    observation, reward, terminal = env.step(action_evn)
    total += reward

    f = open(fileName, 'w')

    start_time = time.time()

    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action = q_agent.observe_and_act(observation, reward, count)

        action_evn = list(action)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    # evaluation
    total = 0

    fileName = fileName + '_target'
    f = open(fileName, 'w')

    start_time = time.time()

    for i in range(T_EVAL):
        count = i + 1
        observation = tuple(observation.tolist())
        action = q_agent.target_observe_and_act(observation, reward, count)

        action_evn = list(action)

        # if count <= 50:
        #
        #   print('observation')
        #   print(observation)
        #
        #   print('action')
        #   print(action_evn)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write('Q table final accu_reward is %f and time duration is %f\n' % (accum_reward, duration))
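The init_state built above is a history-length window of "channel state unknown" markers, one -1 per channel. A tiny illustration with made-up sizes (using range() in place of the Python 2 xrange() above):

N_CHANNELS = 4  # placeholder sizes for illustration
history = 2

init_state = tuple([tuple([-1 for i in range(N_CHANNELS)]) for j in range(history)])
print(init_state)  # ((-1, -1, -1, -1), (-1, -1, -1, -1))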
Example #11
    return action_evn


p_matrix = [[(0.6, 0.4), (0.2, 0.8)]] * N_CHANNELS

# step 1: init BrainDQN
env = Environment(p_matrix)

brain = BrainDQN()

fileName = 'log_DQN_temp'

action = np.zeros(int(ACTION_SIZE))
action[0] = 1
action_env = process(action)
observation, reward, terminal = env.step(action_env)

brain.setInitState(observation)

index = 0
total = 0

start_time = time.time()

f = open(fileName, 'w')

# step 2: play the game while learning
while index <= T_THRESHOLD:

    index += 1
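The p_matrix above gives each channel a 2x2 Markov transition matrix: each inner pair is one row and sums to 1. Which state index means "idle" versus "busy" is the project's convention and not visible on this page, so the sampler below is only a hedged sketch of advancing such per-channel chains by one step:

import random

p_matrix = [[(0.6, 0.4), (0.2, 0.8)]] * 3  # 3 channels, for illustration only

def step_channels(states, p_matrix):
    """Advance every channel one step of its own 2-state Markov chain."""
    next_states = []
    for s, rows in zip(states, p_matrix):
        stay_prob = rows[s][s]  # probability of remaining in state s
        next_states.append(s if random.random() < stay_prob else 1 - s)
    return next_states

print(step_channels([0, 1, 1], p_matrix))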
Example #12
def run_test(f_result,
             p_matrix=P_MATRIX,
             fileName='log_q_table',
             history=AGENT_STATE_WINDOWS_SIZE):
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))

    env = Environment(p_matrix)

    init_state = tuple(
        [tuple([-1 for i in xrange(N_CHANNELS)]) for j in xrange(history)])
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)

    total = 0

    action_evn = [i for i in range(N_SENSING)]  # initial action

    observation, reward, terminal = env.step(action_evn)
    total += reward

    f = open(fileName, 'w')

    start_time = time.time()

    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action = q_agent.observe_and_act(observation, reward, count)

        action_evn = list(action)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write(
                'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                % (count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    # evaluation
    total = 0

    fileName = fileName + '_target'
    f = open(fileName, 'w')

    start_time = time.time()

    for i in range(T_EVAL):
        count = i + 1
        observation = tuple(observation.tolist())
        action = q_agent.target_observe_and_act(observation, reward, count)

        action_evn = list(action)

        # if count <= 50:
        #
        #   print('observation')
        #   print(observation)
        #
        #   print('action')
        #   print(action_evn)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write(
                'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                % (count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write(
        'Q table final accu_reward is %f and time duration is %f\n' %
        (accum_reward, duration))
Example #13
def run_test(f_result,
             p_matrix,
             fileName='log_q_table_suff',
             history=AGENT_STATE_WINDOWS_SIZE):
    # all_states_list = tuple(product(range(-1, 2), repeat=N_CHANNELS))
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))
    # all_observations_list = tuple(product(all_states_list,repeat=AGENT_STATE_WINDOWS_SIZE))

    # p_matrix = [[(0.6, 0.4), (0.2, 0.8)]] * N_CHANNELS

    env = Environment(p_matrix)

    init_state = set_init_state(p_matrix)

    q_agent = QAgent(state_transition_function, init_state, all_actions_list)

    total = 0

    action_evn = [i for i in range(N_SENSING)]  # initial action

    observation, reward, terminal = env.step(action_evn)
    total += reward

    f = open(fileName, 'w')

    start_time = time.time()

    prev_value_dict = {}
    count_cvg = 0

    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action, prev_value_dict, count_cvg = q_agent.observe_and_act(
            observation, reward, count, p_matrix, prev_value_dict, count_cvg)

        if count_cvg == T_CVG:
            print('policy converged at training round %d' % i)

            break

        action_evn = list(action)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write(
                'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                % (count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    f_result.write('count_cvg is %d' % count_cvg)
    f_result.write('\n')
    f_result.write(str(prev_value_dict))
    f_result.write('\n')

    # evaluation

    total = 0

    fileName = fileName + '_target'
    f = open(fileName, 'w')

    start_time = time.time()

    for i in range(T_EVAL):
        count = i + 1

        if type(observation).__module__ == 'numpy':
            observation = tuple(observation.tolist())
        else:
            print('training finished earlier')

        action = q_agent.target_observe_and_act(observation, reward, count,
                                                p_matrix)

        action_evn = list(action)

        # if count <= 50:
        #
        #   print('observation')
        #   print(observation)
        #
        #   print('action')
        #   print(action_evn)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % 10 == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write(
                'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                % (count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write(
        'Q table_suff final accu_reward is %f and time duration is %f\n' %
        (accum_reward, duration))
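The type(observation).__module__ == 'numpy' test above asks whether the observation is still a NumPy array (fresh from env.step()) or already a tuple from an earlier conversion. An equivalent, more explicit helper, offered only as an alternative sketch:

import numpy as np

def as_state_tuple(observation):
    """Convert a NumPy observation to a hashable tuple; pass tuples through unchanged."""
    if isinstance(observation, np.ndarray):
        return tuple(observation.tolist())
    return observation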
Example #14
def run_test(f_result, p_matrix=P_MATRIX, fileName='log_q_table', history=AGENT_STATE_WINDOWS_SIZE):
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))

    env = Environment(p_matrix)

    # init_state = tuple([tuple([-1 for i in xrange(N_CHANNELS)]) for j in xrange(history)])


    init_state = tuple([tuple([0]+[-1 for i in range(N_CHANNELS-1)])])
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)

    total = 0

    action_evn = [i for i in range(N_SENSING)]  # initial action

    observation, reward, terminal = env.step(action_evn)
    total += reward

    f = open(fileName, 'w')

    start_time = time.time()

    prev_value_dict = {}
    count_cvg = 0

    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action, prev_value_dict, count_cvg = q_agent.observe_and_act(observation, reward, count, prev_value_dict,
                                                                     count_cvg)

        # if count_cvg == T_CVG:
        #     print 'policy converged, and round of training %d' % i
        #
        #     break


        action_evn = list(action)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    f_result.write('count_cvg is %d' % count_cvg)
    f_result.write('\n')
    f_result.write(str(prev_value_dict))
    f_result.write('\n')
    f_result.write('final policy\n')
    f_result.write(str(q_agent.get_policy()))
    f_result.write('\n')

    # evaluation
    accum_reward = 0

    # fileName = fileName + '_target'
    # f = open(fileName, 'w')

    start_time = time.time()

    gamma = GAMMA

    for j in range(T_TIMES):

        total = 0
        env = Environment(p_matrix)
        observation = np.array([0] + [-1 for l in range(N_CHANNELS-1)])


        reward = 0

        for i in range(T_EVAL):
            count = i + 1

            # if type(observation).__module__ == 'numpy':
            #     observation = tuple(observation.tolist())
            #
            # else:
            #     print 'train finished'


            observation = tuple(observation.tolist())
            action = q_agent.target_observe_and_act(observation, reward, count)

            action_evn = list(action)

            # if count <= 50:
            #
            #   print('observation')
            #   print(observation)
            #
            #   print('action')
            #   print(action_evn)

            observation, reward, terminal = env.step(action_evn)

            total += reward*(gamma**i)

            # if (count) % PERIOD == 0:
            #     accum_reward = total
            #
            #     duration = time.time() - start_time
            #     f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
            #         count, accum_reward, str(action), duration))
            #     f.write('\n')
        # f.close()

        count = i + 1
        accum_reward += total
        duration = time.time() - start_time
        f_result.write('Q table final accu_reward is %f and time duration is %f\n' % (total, duration))

    return accum_reward
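run_test() above returns the sum of the T_TIMES per-run discounted returns, so a caller that wants the mean return divides by T_TIMES. A hypothetical usage sketch (the result file name and the T_TIMES value are placeholders):

T_TIMES = 10  # must match the constant run_test() iterates over

with open('result_summary', 'w') as f_result:
    total_return = run_test(f_result)  # sum of discounted returns over T_TIMES runs
    print('mean discounted return: %f' % (total_return / float(T_TIMES)))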
Example #15
def run_test(f_result,
             p_matrix=P_MATRIX,
             fileName='log_q_table',
             history=AGENT_STATE_WINDOWS_SIZE):
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))

    env = Environment(p_matrix)

    # init_state = tuple([tuple([-1 for i in xrange(N_CHANNELS)]) for j in xrange(history)])

    init_state = tuple([tuple([0] + [-1 for i in range(N_CHANNELS - 1)])])
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)

    total = 0

    action_evn = [i for i in range(N_SENSING)]  # initial action

    observation, reward, terminal = env.step(action_evn)
    total += reward

    f = open(fileName, 'w')

    start_time = time.time()

    prev_value_dict = {}
    count_cvg = 0

    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action, prev_value_dict, count_cvg = q_agent.observe_and_act(
            observation, reward, count, prev_value_dict, count_cvg)

        # if count_cvg == T_CVG:
        #     print 'policy converged, and round of training %d' % i
        #
        #     break

        action_evn = list(action)

        observation, reward, terminal = env.step(action_evn)
        total += reward

        if (count) % PERIOD == 0:
            accum_reward = total / float(count)

            duration = time.time() - start_time
            f.write(
                'Index %d: accu_reward is %f, action is: %s and time duration is %f'
                % (count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()

    f_result.write('count_cvg is %d' % count_cvg)
    f_result.write('\n')
    f_result.write(str(prev_value_dict))
    f_result.write('\n')
    f_result.write('final policy\n')
    f_result.write(str(q_agent.get_policy()))
    f_result.write('\n')

    # evaluation
    accum_reward = 0

    # fileName = fileName + '_target'
    # f = open(fileName, 'w')

    start_time = time.time()

    gamma = GAMMA

    for j in range(T_TIMES):

        total = 0
        env = Environment(p_matrix)
        observation = np.array([0] + [-1 for l in range(N_CHANNELS - 1)])

        reward = 0

        for i in range(T_EVAL):
            count = i + 1

            # if type(observation).__module__ == 'numpy':
            #     observation = tuple(observation.tolist())
            #
            # else:
            #     print 'train finished'

            observation = tuple(observation.tolist())
            action = q_agent.target_observe_and_act(observation, reward, count)

            action_evn = list(action)

            # if count <= 50:
            #
            #   print('observation')
            #   print(observation)
            #
            #   print('action')
            #   print(action_evn)

            observation, reward, terminal = env.step(action_evn)

            total += reward * (gamma**i)

            # if (count) % PERIOD == 0:
            #     accum_reward = total
            #
            #duration = time.time() - start_time
            # f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
            #count, accum_reward, str(action), duration))
            # f.write('\n')
        # f.close()

        count = i + 1
        accum_reward += total
        duration = time.time() - start_time
        f_result.write(
            'Q table final accu_reward is %f and time duration is %f\n' %
            (total, duration))

    return accum_reward