def run_whittleIndex(f_result, p_matrix=P_DISTINCT_MATRIX, fileName='log_whittle'):
    # f = open(fileName, 'w')
    start_time = time.time()
    accum_reward = 0
    for j in range(T_TIMES):
        total = 0
        gamma = GAMMA
        env = Environment(p_matrix)
        print p_matrix
        brain = WhittleIndex(B, gamma, p_matrix)
        action = [i for i in range(N_SENSING)]  # initial action: sense the first N_SENSING channels
        for i in range(T_EVAL):
            observation, reward, terminal = env.step(action)
            total += reward * (gamma ** i)  # discounted return for this run
            action = brain.getAction(action, observation, 0)
            count = i + 1
            # if count % PERIOD == 0:
            #     accum_reward = total
            #     duration = time.time() - start_time
            #     f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
            #         count, accum_reward, str(action), duration))
            #     f.write('\n')
        # f.close()
        accum_reward += total
    duration = time.time() - start_time
    f_result.write('Whittle Index final accu_reward is %f and time duration is %f\n'
                   % (total, duration))
    return accum_reward
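# --- Illustrative usage (not part of the original source) ----------------------
# A minimal sketch of how the Whittle-index baseline above could be driven,
# assuming this module's constants (P_DISTINCT_MATRIX, T_TIMES) are importable
# here; the helper name and the result-file name 'result_whittle' are made up.
def _demo_whittle_baseline(result_path='result_whittle'):
    f_result = open(result_path, 'w')
    try:
        # run_whittleIndex() returns the sum of T_TIMES discounted returns
        accum = run_whittleIndex(f_result)
        f_result.write('Whittle Index mean discounted reward over %d runs: %f\n'
                       % (T_TIMES, accum / float(T_TIMES)))
    finally:
        f_result.close()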
def run_random(f_result, p_matrix=P_DISTINCT_MATRIX, fileName='log_random'):
    # init environment and random sensing policy
    env = Environment(p_matrix)
    brain = RandomPolciy()
    action = np.zeros(int(ACTION_SIZE))
    action[0] = 1
    action_env = _process(action)
    observation, reward, terminal = env.step(action_env)
    count = 0
    total = 0
    total += reward
    start_time = time.time()
    f = open(fileName, 'w')
    for i in range(T_THRESHOLD):
        count = i + 1
        action = brain.getAction()
        action_env = _process(action)
        observation, reward, terminal = env.step(action_env)
        total += reward
        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action_env), duration))
            f.write('\n')
    f.close()
    duration = time.time() - start_time
    accum_reward = total / float(count)
    f_result.write('Random final accu_reward is %f and time duration is %f\n' % (accum_reward, duration))
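# run_random() above relies on a module-level helper _process() that is not part
# of this excerpt. Judging from the process() helper defined later in this section
# (np.nonzero on the one-hot vector, then an ACTION_SPACE lookup), it presumably
# does the equivalent of the sketch below; the name _process_sketch is made up to
# avoid clashing with the real helper.
def _process_sketch(action):
    # position of the single '1' entry in the one-hot action vector
    action_id = np.nonzero(action)[0]
    # map that index to the tuple of channels to sense
    return list(ACTION_SPACE[int(action_id)])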
def target_network_eval(self, fileName, p_matrix, f_result):
    env = Environment(p_matrix)
    action = np.zeros(int(ACTION_SIZE))
    action[0] = 1
    action_env = self.process(action)
    observation, reward, terminal = env.step(action_env)
    currentState = self.setInitState(observation)
    count = 0
    total = 0
    total += reward
    epsilon = INITIAL_EPSILON
    print("Start target evaluation")
    start_time = time.time()
    f = open(fileName, 'w')
    while count < T_EVAL:
        count += 1
        if count <= 10:
            action = np.zeros(int(ACTION_SIZE))
            action_index = random.randrange(ACTION_SIZE)
            action[int(action_index)] = 1
        else:
            action = self.get_target_action(currentState, epsilon, count)
        action_env = self.process(action)
        if count > 15 and count < 50:
            print('observation')
            print(observation)
            print('action')
            print(action_env)
        observation, reward, terminal = env.step(action_env)
        total += reward
        nextState = np.concatenate((currentState[CHANNEL_SIZE:], observation))
        currentState = nextState
        if (count + 1) % PERIOD == 0:
            duration = time.time() - start_time
            accum_reward = total / float(count + 1)
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count + 1, accum_reward, str(action_env), duration))
            f.write('\n')
    f.close()
    duration = time.time() - start_time
    accum_reward = total / float(count)
    f_result.write('Async Qlearing using target final accu_reward is %f, and time duration is %f\n'
                   % (accum_reward, duration))
    print("target evaluation ends")
def q_learner_thread(self, thread_id, fileName, p_matrix, f_result):
    env = Environment(p_matrix)
    action = np.zeros(int(ACTION_SIZE))
    action[0] = 1
    action_env = self.process(action)
    observation, reward, terminal = env.step(action_env)
    currentState = self.setInitState(observation)
    state_batch = []
    y_batch = []
    action_batch = []
    nextState_batch = []
    reward_batch = []
    terminal_batch = []
    count = 0
    total = 0
    total += reward
    epsilon = INITIAL_EPSILON
    final_epsilon = self.sample_final_epsilon()
    print("Start thread %d" % thread_id)
    time.sleep(3 * thread_id)  # stagger thread start-up
    start_time = time.time()
    f = open(fileName, 'w')
    while count < T_THRESHOLD:
        count += 1
        state_batch.append(currentState)
        action = self.getAction(currentState, epsilon)
        action_batch.append(action)
        action_env = self.process(action)
        observation, reward, terminal = env.step(action_env)
        total += reward
        reward_batch.append(reward)
        terminal_batch.append(terminal)
        nextState = np.concatenate((currentState[CHANNEL_SIZE:], observation))
        nextState_batch.append(nextState)
        currentState = nextState
        if count % ASYNC_UPDATE_INTERVAL == 0:
            # bootstrap targets from the target network, then apply one gradient step
            q_values_batch = self.q_values_T.eval(
                session=self.session,
                feed_dict={self.state_placeholder_T: nextState_batch})
            for i in range(ASYNC_UPDATE_INTERVAL):
                if terminal_batch[i]:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * np.max(q_values_batch[i]))
            self.train_op.run(session=self.session,
                              feed_dict={self.state_placeholder: state_batch,
                                         self.action_placeholder: action_batch,
                                         self.y_placeholder: y_batch})
            state_batch = []
            y_batch = []
            action_batch = []
            nextState_batch = []
            reward_batch = []
            terminal_batch = []
        if count % TARGET_UPDATE_INTERVAL == 0:
            self.session.run(self.updateTargetNetwork)
        # anneal epsilon
        if epsilon > final_epsilon and count > OBSERVE:
            epsilon -= (INITIAL_EPSILON - final_epsilon) / EXPLORE
        if (count + 1) % PERIOD == 0:
            duration = time.time() - start_time
            accum_reward = total / float(count + 1)
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count + 1, accum_reward, str(action_env), duration))
            f.write('\n')
    f.close()
    duration = time.time() - start_time
    accum_reward = total / float(count)
    f_result.write('Async Qlearing of thread %d final accu_reward is %f, and time duration is %f\n' % (
        thread_id, accum_reward, duration))
    print("thread %d end" % thread_id)
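# --- Illustrative launch pattern (not part of the original source) -------------
# A minimal sketch of how the asynchronous learners above could be started,
# assuming an agent object exposing q_learner_thread() and target_network_eval()
# with the signatures defined in this class; the thread count, helper name and
# log-file names are made up.
def run_async_training(agent, p_matrix, f_result, n_threads=4):
    import threading
    threads = [threading.Thread(target=agent.q_learner_thread,
                                args=(tid, 'log_async_%d' % tid, p_matrix, f_result))
               for tid in range(n_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # greedy evaluation with the learned target network once all learners finish
    agent.target_network_eval('log_async_target', p_matrix, f_result)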
def run_test(f_result, p_matrix, fileName='log_q_table_suff', history=AGENT_STATE_WINDOWS_SIZE):
    # all_states_list = tuple(product(range(-1, 2), repeat=N_CHANNELS))
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))
    # all_observations_list = tuple(product(all_states_list, repeat=AGENT_STATE_WINDOWS_SIZE))
    # p_matrix = [[(0.6, 0.4), (0.2, 0.8)]] * N_CHANNELS
    env = Environment(p_matrix)
    init_state = set_init_state(p_matrix)
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)
    total = 0
    action_evn = [i for i in range(N_SENSING)]  # initial action
    observation, reward, terminal = env.step(action_evn)
    total += reward
    f = open(fileName, 'w')
    start_time = time.time()
    prev_value_dict = {}
    count_cvg = 0
    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action, prev_value_dict, count_cvg = q_agent.observe_and_act(
            observation, reward, count, p_matrix, prev_value_dict, count_cvg)
        if count_cvg == T_CVG:
            print 'policy converged, and round of training %d' % i
            break
        action_evn = list(action)
        observation, reward, terminal = env.step(action_evn)
        total += reward
        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()
    f_result.write('count_cvg is %d' % count_cvg)
    f_result.write('\n')
    f_result.write(str(prev_value_dict))
    f_result.write('\n')
    # evaluation
    total = 0
    fileName = fileName + '_target'
    f = open(fileName, 'w')
    start_time = time.time()
    for i in range(T_EVAL):
        count = i + 1
        if type(observation).__module__ == 'numpy':
            observation = tuple(observation.tolist())
        else:
            print 'train finished earlier'
        action = q_agent.target_observe_and_act(observation, reward, count, p_matrix)
        action_evn = list(action)
        # debug: optionally print observation / action for the first evaluation steps
        observation, reward, terminal = env.step(action_evn)
        total += reward
        if count % 10 == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()
    duration = time.time() - start_time
    accum_reward = total / float(count)
    f_result.write('Q table_suff final accu_reward is %f and time duration is %f\n' % (accum_reward, duration))
def run_test(f_result, p_matrix=P_MATRIX, fileName='log_q_table', history=AGENT_STATE_WINDOWS_SIZE):
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))
    env = Environment(p_matrix)
    init_state = tuple([tuple([-1 for i in xrange(N_CHANNELS)]) for j in xrange(history)])
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)
    total = 0
    action_evn = [i for i in range(N_SENSING)]  # inital action
    observation, reward, terminal = env.step(action_evn)
    total += reward
    f = open(fileName, 'w')
    start_time = time.time()
    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action = q_agent.observe_and_act(observation, reward, count)
        action_evn = list(action)
        observation, reward, terminal = env.step(action_evn)
        total += reward
        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()
    # evaluation
    total = 0
    fileName = fileName + '_target'
    f = open(fileName, 'w')
    start_time = time.time()
    for i in range(T_EVAL):
        count = i + 1
        observation = tuple(observation.tolist())
        action = q_agent.target_observe_and_act(observation, reward, count)
        action_evn = list(action)
        # if count <= 50:
        #     # print('observation')
        #     print(observation)
        #     # print('action')
        #     print(action_evn)
        observation, reward, terminal = env.step(action_evn)
        total += reward
        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()
    count = i + 1
    accum_reward = total / float(count)
    duration = time.time() - start_time
    f_result.write('Q table final accu_reward is %f and time duration is %f\n' % (accum_reward, duration))
# convert a one-hot action vector to the action used by the environment
def process(action):
    action_id = np.nonzero(action)[0]
    action_evn = list(ACTION_SPACE[int(action_id)])
    return action_evn


p_matrix = [[(0.6, 0.4), (0.2, 0.8)]] * N_CHANNELS

# step 1: init BrainDQN
env = Environment(p_matrix)
brain = BrainDQN()
fileName = 'log_DQN_temp'
action = np.zeros(int(ACTION_SIZE))
action[0] = 1
action_env = process(action)
observation, reward, terminal = env.step(action_env)
brain.setInitState(observation)
index = 0
total = 0
def run_test(f_result, p_matrix=P_MATRIX, fileName='log_q_table', history=AGENT_STATE_WINDOWS_SIZE):
    all_actions_list = tuple(product(range(0, N_CHANNELS), repeat=N_SENSING))
    env = Environment(p_matrix)
    # init_state = tuple([tuple([-1 for i in xrange(N_CHANNELS)]) for j in xrange(history)])
    init_state = tuple([tuple([0] + [-1 for i in range(N_CHANNELS - 1)])])
    q_agent = QAgent(state_transition_function, init_state, all_actions_list)
    total = 0
    action_evn = [i for i in range(N_SENSING)]  # initial action
    observation, reward, terminal = env.step(action_evn)
    total += reward
    f = open(fileName, 'w')
    start_time = time.time()
    prev_value_dict = {}
    count_cvg = 0
    # training
    for i in range(T_THRESHOLD):
        count = i + 1
        observation = tuple(observation.tolist())
        action, prev_value_dict, count_cvg = q_agent.observe_and_act(
            observation, reward, count, prev_value_dict, count_cvg)
        # if count_cvg == T_CVG:
        #     print 'policy converged, and round of training %d' % i
        #     break
        action_evn = list(action)
        observation, reward, terminal = env.step(action_evn)
        total += reward
        if count % PERIOD == 0:
            accum_reward = total / float(count)
            duration = time.time() - start_time
            f.write('Index %d: accu_reward is %f, action is: %s and time duration is %f' % (
                count, accum_reward, str(action), duration))
            f.write('\n')
    f.close()
    f_result.write('count_cvg is %d' % count_cvg)
    f_result.write('\n')
    f_result.write(str(prev_value_dict))
    f_result.write('\n')
    f_result.write('final policy\n')
    f_result.write(str(q_agent.get_policy()))
    f_result.write('\n')
    # evaluation: average the discounted return over T_TIMES fresh episodes
    # (per-step debug printing and the '_target' log file are disabled here)
    accum_reward = 0
    start_time = time.time()
    gamma = GAMMA
    for j in range(T_TIMES):
        total = 0
        env = Environment(p_matrix)
        observation = np.array([0] + [-1 for l in range(N_CHANNELS - 1)])
        reward = 0
        for i in range(T_EVAL):
            count = i + 1
            observation = tuple(observation.tolist())
            action = q_agent.target_observe_and_act(observation, reward, count)
            action_evn = list(action)
            observation, reward, terminal = env.step(action_evn)
            total += reward * (gamma ** i)  # discounted return for this episode
        accum_reward += total
    duration = time.time() - start_time
    f_result.write('Q table final accu_reward is %f and time duration is %f\n' % (total, duration))
    return accum_reward
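# --- Illustrative usage (not part of the original source) ----------------------
# run_test() above trains the tabular Q-agent and then reports the discounted
# return summed over T_TIMES evaluation episodes, mirroring run_whittleIndex().
# A hypothetical driver; the result-file name 'result_q_table' is made up.
if __name__ == '__main__':
    f_result = open('result_q_table', 'w')
    try:
        q_total = run_test(f_result, p_matrix=P_MATRIX)
        f_result.write('Q table mean discounted reward over %d runs: %f\n'
                       % (T_TIMES, q_total / float(T_TIMES)))
    finally:
        f_result.close()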