def train_dqn(episode):
    loss = []
    agent = DQN(4, 10)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        agent.set_session(sess)
        sess.run(init)  # initialise graph variables before training
        for e in range(episode):
            print("Episode {}".format(e))
            state = env.reset()
            state = np.reshape(state, (1, 10))
            score = 0
            max_steps = 1000
            for i in tqdm(range(max_steps)):
                action = agent.act(state)
                for _ in range(11):
                    reward, next_state, done = env.step(action)
                    score += reward
                    next_state = np.reshape(next_state, (1, 10))
                    agent.remember(state, action, reward, next_state, done)
                    state = next_state
                    agent.replay()
                    if done:
                        print("")
                        print("episode: {}/{}, score: {}".format(e, episode, score))
                        time.sleep(2)
                        break
                if done:  # stop the episode once the environment reports termination
                    break
            loss.append(score)
    return loss
def train_dqn(episode):
    action_dict = {0: 'left', 1: 'down', 2: 'right', 3: 'up', 4: 'stay'}
    loss = []
    agent = DQN(5, 10)
    for e in range(episode):
        print("Episode {}".format(e))
        state = env.reset()
        state = np.reshape(state, (1, 10))
        score = 0
        max_steps = 1000
        full_msg = ''
        for i in tqdm(range(max_steps)):
            action = agent.act(state)
            for _ in range(11):
                reward, next_state, done, full_msg = env.step(action, full_msg)
                # time.sleep(2)
                score += reward
                next_state = np.reshape(next_state, (1, 10))
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                agent.replay()
                if done:
                    print("")
                    print("episode: {}/{}, score: {}".format(e, episode, score))
                    time.sleep(2)
                    break
            if done:  # stop the episode once the environment reports termination
                break
        loss.append(score)
    return loss
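# --- Hedged sketch, not the original agent --------------------------------
# Both train_dqn variants above assume a DQN agent exposing act(), remember()
# and replay(). The class below is a minimal Keras-based stand-in written only
# to make those loops readable. The constructor signature DQN(action_space,
# state_space) mirrors the calls DQN(4, 10) / DQN(5, 10); the layer sizes,
# epsilon schedule and batch size are assumptions, and the TF1 session
# plumbing (set_session) used in the first snippet is omitted here.
import random
from collections import deque

import numpy as np
from tensorflow import keras


class DQN:
    def __init__(self, action_space, state_space):
        self.action_space = action_space
        self.state_space = state_space
        self.memory = deque(maxlen=2500)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.batch_size = 64
        self.model = keras.Sequential([
            keras.layers.Dense(64, activation="relu", input_shape=(state_space,)),
            keras.layers.Dense(64, activation="relu"),
            keras.layers.Dense(action_space, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=keras.optimizers.Adam(1e-3))

    def act(self, state):
        # epsilon-greedy action selection over the network's Q-values
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        return int(np.argmax(self.model.predict(state, verbose=0)[0]))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        # one gradient step on a random minibatch of stored transitions
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states = np.vstack([b[0] for b in batch])
        next_states = np.vstack([b[3] for b in batch])
        targets = self.model.predict(states, verbose=0)
        next_q = self.model.predict(next_states, verbose=0)
        for k, (_, a, r, _, d) in enumerate(batch):
            targets[k, a] = r if d else r + self.gamma * np.max(next_q[k])
        self.model.fit(states, targets, epochs=1, verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)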
def testing():
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000
    total_t = 0
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []
    full_msg = ''

    model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="model")
    with tf.Session() as sess:
        model.set_session(sess)
        sess.run(tf.global_variables_initializer())
        model.load()

        obs = env.reset()
        state = obs
        acc = [1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2]
        for i in tqdm(range(int(10e6))):
            action = np.random.choice(K)
            obs, reward, done, _, full_msg = env.step(action, full_msg)
            # time.sleep(0.8)
            # print(action)
            done = (done == 1)
            next_state = obs
            if done:
                obs = env.reset()
                state = obs
            else:
                state = next_state
def run():
    step_of_each_round = []
    for i in range(round):
        t1 = datetime.datetime.now()
        print("round :", i)
        observation = env.reset()
        # env.plt('start')
        step = 0
        while True:
            # choose action
            action = []
            for j in range(8):
                action.append(RL[j].choose_action(observation))
            # update environment
            observation_, reward, done = env.step(action)
            # after training for a while, update the display
            # if step > 10000:
            #     env.plt('update')
            # store memory
            for j in range(8):
                RL[j].store_transition(observation, action[j], reward[j], observation_)
            if (step > 200) and (step % 5 == 0):
                for j in range(8):
                    RL[j].learn()
            if done:
                break
            observation = observation_
            step = step + 1
        step_of_each_round.append(step)
        t2 = datetime.datetime.now()
        print(t2 - t1)
    end = datetime.datetime.now()
    print(end - start)

    # output data
    csvFile = open('./data.csv', "a", newline='')
    data = RL[0].layers
    data.append(sum(step_of_each_round) / round)
    data.append(sum(step_of_each_round[-51:-1]) / 50)
    writer = csv.writer(csvFile, dialect='excel')
    writer.writerow(data)
    csvFile.close()

    print('average step: ', sum(step_of_each_round) / round)
    print('average step of latest 50 rounds: ', sum(step_of_each_round[-51:-1]) / 50)
    plt.plot(step_of_each_round)
    plt.pause(0)
def run():
    step_of_each_round = []
    for i in range(round):
        print(i)
        observation = env.reset()
        env.plt('start')
        step = 0
        while True:
            # relative observations: each agent sees the offset to its neighbour
            observation_of_agent = []
            observation_of_agent_ = []
            for j in range(7):
                observation_of_agent.append(observation[j + 1] - observation[j])
            observation_of_agent.append(observation[0] - observation[7])

            action_list = np.array([[0., 0.], [0, 0], [0, 0], [0, 0],
                                    [0, 0], [0, 0], [0, 0], [0, 0]])
            # choose action
            action = []
            for j in range(8):
                action.append(RL[j].choose_action(observation_of_agent[j]))
                if np.linalg.norm(env.E[j]) > 0.1:
                    action_list[j] = vel[action[j]]

            # update environment
            observation_, reward, done = env.step(action_list)
            for j in range(7):
                observation_of_agent_.append(observation_[j + 1] - observation_[j])
            observation_of_agent_.append(observation_[0] - observation_[7])
            if i > 50:
                env.plt('update')

            # store memory
            for j in range(8):
                RL[j].store_transition(observation_of_agent[j], action[j],
                                       reward[j], observation_of_agent_[j])
            if (step > 200) and (step % 5 == 0):
                for j in range(8):
                    RL[j].learn()
            if done:
                # env.plt('finish')
                # RL[1].plot_cost()
                env.plt('clean')
                break
            observation = observation_  # advance to the new joint observation
            step = step + 1
        step_of_each_round.append(step)

    plt.ioff()
    for i in range(8):
        RL[i].plot_cost()
    plt.pause(5)
    print(sum(step_of_each_round) / round)
    plt.plot(step_of_each_round)
    plt.pause(0)
def collecting_training_samples(config, mcts, env, temp):
    env.reset4train()
    terminal = False
    reward = 0
    data = []
    while not terminal:
        act, acts, act_probs = mcts.get_action(cp.deepcopy(env), temp, training=True)
        # keep the sample record separate from the `temp` temperature argument
        record = [
            env.uid, env.profile,
            cp.deepcopy(env.trajectory),
            cp.deepcopy(env.cat_trajectory),
            env.node_type
        ]
        candidate, node_type, reward, terminal = env.step(act)
        record.extend([act, acts, act_probs, reward])
        data.append(record)
    # back-fill every sample with the final episode reward
    for item in data:
        item[-1] = reward
    print(reward)
    return data, reward
def testing():
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000
    total_t = 0
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []

    model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="model")
    with tf.compat.v1.Session() as sess:
        model.set_session(sess)
        sess.run(tf.compat.v1.global_variables_initializer())
        model.load()

        state = env.reset()
        acc = [1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2]
        sum_reward = 0
        for i in range(int(10e6)):
            # if i % 4 == 0:
            #     action = model.sample_action(state, 0.1)
            # else:
            action = model.sample_action(state, 0.1)
            # action = acc[i % len(acc)]
            next_state, reward, done, _ = env.step(action)
            time.sleep(0.8)
            # print(action)
            done = done == 1
            sum_reward += reward
            if done:
                obs = env.reset()
                print("Reward: {}".format(sum_reward))
                sum_reward = 0
                state = obs
            else:
                state = next_state
optimizer = optim.Adam(Q.parameters(), lr=args.lr)

# TODO wrap data in dataset
for i in range(args.iterations):
    done = False
    s = env.reset()
    # TODO fold into rollout
    while not done:
        epsilon = decay_exploration(i, epsilon)
        a = epsilon_greedy(s, epsilon=epsilon)
        succ, r, done, _ = env.step(a)
        replay_buffer.append([s, a, r, succ, done])
        s = succ

    if i % args.batch_size == 0 and i > 0 and len(replay_buffer) >= args.batch_size:
        # TODO cuda and var
        state, val = train(replay_buffer, Q)
        y = Q(state)
        optimizer.zero_grad()
        loss = criterion(y, val)
        loss.backward()
        optimizer.step()  # apply the gradient update
        print(loss.item())
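# --- Hedged sketch, assumption about the missing helper --------------------
# The PyTorch loop above relies on a train(replay_buffer, Q) helper that is
# not shown. A minimal version consistent with how its return values are used
# (y = Q(state); loss = criterion(y, val)) could sample a minibatch and build
# one-step bootstrapped Q-targets as below; the gamma and batch_size defaults
# are assumptions.
import random

import torch


def train(replay_buffer, Q, batch_size=32, gamma=0.99):
    batch = random.sample(replay_buffer, batch_size)
    s, a, r, succ, done = zip(*batch)
    state = torch.tensor(s, dtype=torch.float32)
    succ_t = torch.tensor(succ, dtype=torch.float32)
    reward = torch.tensor(r, dtype=torch.float32)
    done_t = torch.tensor(done, dtype=torch.float32)
    action = torch.tensor(a, dtype=torch.int64)

    # start from the current predictions and overwrite the taken actions
    # with the bootstrapped TD target (no bootstrap on terminal transitions)
    with torch.no_grad():
        val = Q(state).clone()
        target = reward + gamma * (1.0 - done_t) * Q(succ_t).max(dim=1).values
        val[torch.arange(batch_size), action] = target
    return state, val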
# nextState.shape -> (1, 13, 13, 2)
gameOver = False
while not gameOver:
    # Selecting an action to play
    if np.random.rand() <= epsilon:
        action = np.random.randint(0, 2)
    else:
        qvalues = model.predict(currentState)[0]
        action = np.argmax(qvalues)

    # Updating the epsilon and saving the model
    epsilon -= epsilonDecayRate
    epsilon = max(epsilon, minEpsilon)

    # Updating the Environment
    frame, reward, gameOver = env.step(action)
    if frame.shape[0] == 9:
        # increase the signal of positive rewards
        if reward > 0:
            reward = reward * 1000
        # frame.shape -> (13, 13)
        frame = np.reshape(frame, (1, env.wind, env.prices_indicators.shape[1], 1))
        # frame.shape -> (1, 13, 13, 1)
        nextState = np.append(nextState, frame, axis=3)
        # nextState.shape -> (1, 13, 13, 3)
        nextState = np.delete(nextState, 0, axis=3)
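# --- Hedged sketch, assumption --------------------------------------------
# The fragment above uses currentState / nextState of shape (1, 13, 13, N)
# before the loop without showing how they are built. One plausible way to
# initialise the frame stack from a reset observation, matching the reshape
# and np.append / np.delete pattern inside the loop, is sketched below; the
# env.reset() return shape and the helper name are assumptions.
import numpy as np


def init_frame_stack(env, n_frames=2):
    frame = env.reset()  # assumed to return a (13, 13) observation
    frame = np.reshape(frame, (1, env.wind, env.prices_indicators.shape[1], 1))
    return np.repeat(frame, n_frames, axis=3)  # -> (1, 13, 13, n_frames)

# possible usage:
# currentState = init_frame_stack(env)
# nextState = currentState.copy()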
def trainning():
    num_action_act = [0, 0, 0, 0, 0]
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000000
    total_t = 0
    experience_replay_buffer = []
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []
    full_msg = ''
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_change = (epsilon - epsilon_min) / 1000000  # 500000

    model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="model")
    target_model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="target_model")

    with tf.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        sess.run(tf.global_variables_initializer())
        model.load()

        print("Filling experience replay buffer...")
        obs = env.reset()
        state = obs
        # for i in range(MIN_EXPERIENCES):
        for i in tqdm(range(MIN_EXPERIENCES)):
            action = np.random.randint(0, K)
            num_action_act[action] += 1
            obs, reward, done, _, full_msg = env.step(action, full_msg)
            # time.sleep(0.5)
            # print(obs)
            done = (done == 1)
            next_state = obs
            experience_replay_buffer.append((state, action, reward, next_state, done))
            if done:
                obs = env.reset()
                state = obs
            else:
                state = next_state
        print(num_action_act)

        for i in range(num_episodes):
            t0 = datetime.now()
            obs = env.reset()
            state = obs
            loss = None
            total_time_training = 0
            num_steps_in_episode = 0
            episode_reward = 0
            done = False
            while True:  # for _ in range(0, MAX_STEP):
                if total_t % TARGET_UPDATE_PERIOD == 0:
                    target_model.copy_from(model)
                    print("Copied model parameters to target network, total_t = %s, period = %s"
                          % (total_t, TARGET_UPDATE_PERIOD))
                action = model.sample_action(state, epsilon)
                num_action_act[action] += 1
                time_act = datetime.now()
                obs, reward, done, _, full_msg = env.step(action, full_msg)
                time_act = datetime.now() - time_act
                done = (done == 1)
                next_state = obs
                episode_reward += reward

                if len(experience_replay_buffer) == MAX_EXPERIENCES:
                    experience_replay_buffer.pop(0)
                experience_replay_buffer.append((state, action, reward, next_state, done))

                t0_2 = datetime.now()
                loss = learn(model, target_model, experience_replay_buffer, gamma, batch_sz)
                dt = datetime.now() - t0_2
                # Confirm whether the environment step or the training step dominates
                # if time_act > dt:
                #     print("Java timeout")
                # else:
                #     print("Python timeout")
                total_time_training += dt.total_seconds()
                num_steps_in_episode += 1
                state = next_state
                total_t += 1
                epsilon = max(epsilon - epsilon_change, epsilon_min)
                if done:
                    break

            duration = datetime.now() - t0
            episode_rewards[i] = episode_reward
            time_per_step = total_time_training / num_steps_in_episode
            last_100_avg = episode_rewards[max(0, i - 100):i].mean()
            last_100_avgs.append(last_100_avg)
            # print(i)
            # print("last 100: ", last_100_avg)
            # print("reward ", episode_reward)
            # print("rewards ", episode_rewards)
            # print("")
            print("Episode:", i,
                  "Duration:", duration,
                  "Num steps:", num_steps_in_episode,
                  "Reward:", episode_reward,
                  "Training time per step:", "%.3f" % time_per_step,
                  "Avg Reward (last 100):", "%.3f" % last_100_avg,
                  "Epsilon:", "%.3f" % epsilon)
            if i % 50 == 0:
                model.save(i)
            sys.stdout.flush()
            if np.sum(num_action_act) > 5e6:
                break

    plt.plot(last_100_avgs)
    plt.xlabel('episodes')
    plt.ylabel('Average Rewards')
    # plt.show()
    plt.savefig('result.png')
    print(num_action_act)
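# --- Hedged sketch, assumption --------------------------------------------
# Both trainning() loops call learn(model, target_model, buffer, gamma,
# batch_sz) without showing it. The version below is a guess at its shape:
# it samples a random minibatch and regresses the online network toward the
# target network's bootstrapped values. The model.predict(...) and
# model.update(...) methods used here are assumptions about the DQN wrapper
# class; only sample_action/copy_from/save/load appear in the snippets.
import random

import numpy as np


def learn(model, target_model, experience_replay_buffer, gamma, batch_sz):
    samples = random.sample(experience_replay_buffer, batch_sz)
    states, actions, rewards, next_states, dones = map(np.array, zip(*samples))

    # one-step TD targets from the frozen target network
    next_q = np.max(target_model.predict(next_states), axis=1)
    targets = rewards + np.invert(dones).astype(np.float32) * gamma * next_q

    # single gradient step on the online network
    loss = model.update(states, actions, targets)
    return loss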
        # print(action)
        else:
            action = getMaxAction(state_for_action)
    else:
        if not q_table[state_for_action]:
            action = env.action_space.sample()  # Explore action space
        else:
            action = getMaxAction(state_for_action)
    # print(action)
    return action


action = makeAction(state)
next_state_to_process, reward, done, info = env.step(action)  # STEP
# print(next_state_to_process, reward, done, info)
if done:  # DONE
    # print("DONE")
    break

old_value = q_table[state][action]  # TODO
next_state = makeState(next_state_to_process)
next_action = makeAction(next_state, False)
next_max = q_table[next_state][next_action]  # TODO
new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
q_table[state][action] = new_value
# if reward == -10:
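# --- Hedged sketch, assumption --------------------------------------------
# The tabular Q-learning fragment above indexes q_table[state][action] and
# calls getMaxAction(state) without showing either. A nested-defaultdict
# table and a greedy lookup consistent with that usage (including the
# `if not q_table[state_for_action]` emptiness check) might look like this.
from collections import defaultdict

# Q-values stored as {state: {action: value}}; an unseen state maps to an
# empty dict, and an unseen action defaults to 0.0.
q_table = defaultdict(lambda: defaultdict(float))


def getMaxAction(state):
    # greedy action with respect to the current Q estimates
    return max(q_table[state], key=q_table[state].get)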
def trainning():
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000000
    total_t = 0
    experience_replay_buffer = []
    episode_rewards = []
    last_100_avgs = []
    max_eps = [-sys.maxsize]
    epsilon = 1.0
    epsilon_min = 0.01
    epsilon_change = (epsilon - epsilon_min) / 1000000  # 500000

    model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="model")
    target_model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="target_model")

    with tf.compat.v1.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        sess.run(tf.compat.v1.global_variables_initializer())
        model.load()

        print("Filling experience replay buffer...")
        state = env.reset()
        # for i in range(MIN_EXPERIENCES):
        for i in tqdm(range(MIN_EXPERIENCES)):
            action = np.random.randint(0, K)
            # keep the pre-step state so the stored transition is (s, a, r, s')
            next_state, reward, done, _ = env.step(action)
            done = done == 1
            # time.sleep(0.5)
            # print(obs)
            experience_replay_buffer.append((state, action, reward, next_state, done))
            if done:
                state = env.reset()
            else:
                state = next_state

        try:
            i = 0
            # for i in range(num_episodes):
            while True:
                i += 1
                state = env.reset()
                loss = None
                num_steps_in_episode = 0
                episode_reward = 0
                done = False
                for _ in range(MAX_STEP):  # while True:
                    if total_t % TARGET_UPDATE_PERIOD == 0:
                        target_model.copy_from(model)
                        print("Copied model parameters to target network, total_t = %s, period = %s"
                              % (total_t, TARGET_UPDATE_PERIOD))
                    action = model.sample_action(state, epsilon)
                    next_state, reward, done, _ = env.step(action)
                    done = done == 1
                    episode_reward += reward

                    if len(experience_replay_buffer) == MAX_EXPERIENCES:
                        experience_replay_buffer.pop(0)
                    experience_replay_buffer.append((state, action, reward, next_state, done))

                    loss = learn(model, target_model, experience_replay_buffer, gamma, batch_sz)
                    num_steps_in_episode += 1
                    state = next_state
                    total_t += 1
                    epsilon = max(epsilon - epsilon_change, epsilon_min)
                    if done:
                        break

                # if not done:
                #     episode_reward -= 100
                episode_rewards.append(episode_reward)  # reward every episode
                last_100_avg = np.array(episode_rewards[max(0, i - 100):i]).mean()
                last_100_avgs.append(last_100_avg)  # average reward every episode
                max_eps.append(max(max_eps[-1], last_100_avg))  # running maximum
                print("Episode: {:>6}, Num steps: {:>3}, Reward: {:>8.3f}, "
                      "Avg reward: {:>5.3f}, Max: {:>5.3f} Eps: {:>5.3f}".format(
                          i, num_steps_in_episode, episode_reward,
                          last_100_avg, max_eps[-1], epsilon))
                if i % 100 == 0:
                    model.save(i)
                sys.stdout.flush()
                if total_t > NUM_FRAME:
                    break
        except:
            print("Break")
        finally:
            max_eps.pop(0)
            data = pd.DataFrame({
                'Reward': episode_rewards,
                'Avg Reward': last_100_avgs,
                'Max': max_eps
            })
            data.to_csv("./data_result.csv")

            figure(num=None, figsize=(15, 8), dpi=80, facecolor='w', edgecolor='k')
            plt.plot('Reward', '--', color="#999999", data=data, label="Reward")
            plt.plot('Avg Reward', data=data, label="Avg Reward")
            plt.plot('Max', data=data, label="Max")
            plt.legend(loc="upper left")
            plt.xlabel('episodes')
            # plt.show()
            plt.savefig('result.png')
agent = Agent(num_nodes=env.graph_size)
scores = []
for i in range(num_eposides):
    score = 0
    num_nodes, mu, edge_index, edge_w, state, done = env.reset()
    state_steps = [state]
    reward_steps = []
    action_steps = []
    steps_cntr = 0
    while not done[0]:
        action = agent.choose_action(mu, edge_index, edge_w, state)
        _, _, _, reward, new_state, done = env.step(action)
        state_steps.append(new_state)
        reward_steps.append(reward)
        action_steps.append(action)
        steps_cntr += 1
        if steps_cntr > n_step + 1:
            # store an n-step transition: state n steps ago, the action taken
            # then, the summed n-step reward, and the latest state
            agent.remember(num_nodes, mu, edge_index, edge_w,
                           state_steps[-(n_step + 1)], action_steps[-n_step],
                           [sum(reward_steps[-n_step:])], state_steps[-1], done)
            agent.learn()
        state = new_state
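# --- Hedged sketch, assumption --------------------------------------------
# The loop above stores an n-step transition whose return is the plain sum of
# the last n rewards, i.e. an undiscounted n-step return. If a discount factor
# is wanted instead, the value passed to agent.remember could be computed with
# a small helper like this; gamma and the helper name are assumptions.
def n_step_return(rewards, gamma=0.99):
    # rewards ordered oldest -> newest, e.g. reward_steps[-n_step:]
    return sum((gamma ** k) * r for k, r in enumerate(rewards))

# possible usage inside the loop:
# agent.remember(..., [n_step_return(reward_steps[-n_step:])], ...)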