def step(self, action):
    # play() returns a comma-separated action string; keep its position and
    # angle but replace the force with the scaled scalar action.
    a = play(self.S)
    a = map(float, a.split(","))
    self.S, r = simulate(self.S, [a[0], a[1], action * 0.01])
    # print "Reward", r
    # Episode ends (with a +20 bonus) once every coin has been pocketed.
    if (len(self.S["White_Locations"]) == 0
            and len(self.S["Black_Locations"]) == 0
            and len(self.S["Red_Location"]) == 0):
        return vectorize_state(self.S), r + 20, True
    return vectorize_state(self.S), r, False
def step(self, action):
    # Rescale the action components (assumed to lie in [-1, 1]) to the
    # simulator's ranges: position in [0, 1], angle in [-45, 225], force in [0, 1].
    self.S, r = simulate(self.S, [(action[0] + 1) / 2,
                                  (action[1] + 1) * 135 - 45,
                                  (action[2] + 1) / 2])
    print "Reward", r
    # Episode ends (with a +20 bonus) once every coin has been pocketed.
    if (len(self.S["White_Locations"]) == 0
            and len(self.S["Black_Locations"]) == 0
            and len(self.S["Red_Location"]) == 0):
        return vectorize_state(self.S), r + 20, True
    return vectorize_state(self.S), r, False
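# Both step() variants above return vectorize_state(self.S), but the helper is
# not shown in this section. Below is a minimal sketch of one plausible
# implementation, assuming the same fixed-size padding used by the DDPG trainer
# later in this section (at most 19 coins, flattened to a 38-dimensional
# vector); the original encoding may differ.
import numpy as np

def vectorize_state(S, max_coins=19):
    coins = S["White_Locations"] + S["Black_Locations"] + S["Red_Location"]
    coins = coins + [(0, 0)] * (max_coins - len(coins))    # pad with dummy coins
    return np.reshape(np.array(coins), (2 * max_coins,))   # flatten (x, y) pairs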
def agent_1player():
    maxGames = 50
    game = 1
    nActions = len(actions)
    while game <= maxGames:
        time_step = 0
        coins = ONE_PEICE
        state, reward = parse_state_message(coins, 0)
        maxTime = 500
        # print("Starting game.............", game)
        while time_step < maxTime:
            # get a Q value for every discretized action
            Qs = []
            scores = np.zeros(nActions)
            for a in range(nActions):
                X = np.array(state + actions[a])
                Q = forward(X)
                Qs.append(Q)
                scores[a] = Q
            # exp_scores = np.exp(scores)
            # # get probability distribution
            # print "exp scores sum", np.sum(exp_scores)

            # sample an action index in proportion to its score
            probs = scores / np.sum(scores)
            cdf = [probs[0]]
            for i in range(1, len(probs)):
                cdf.append(cdf[-1] + probs[i])
            a_index_curr = bisect(cdf, random.random())
            # print 'prob sum', np.sum(probs)
            # for p in probs:
            #     print p
            # pick action
            # action_indices = np.arange(nActions)
            # a_index_curr = stats.rv_discrete(values=(action_indices, probs)).rvs()
            Qcurr = Qs[a_index_curr]
            action_picked = actions[a_index_curr]
            angle = -45 + action_picked[1] * 270
            action = [action_picked[0], angle, action_picked[2]]
            # print 'action', action

            # simulate
            coins, reward = one_step.simulate(coins,
                                              one_step.validate(action, coins))
            if gameEnd(coins):
                break
            nextState, reward = parse_state_message(coins, reward)
            # print(nextState, reward)

            # greedy action for the next state
            Qmax, a_index_next = -np.inf, -1
            for a in range(nActions):
                X = np.array(nextState + actions[a])
                Qnext = forward(X)
                if Qnext > Qmax:
                    a_index_next = a
                    Qmax = Qnext

            # Q-learning style TD error (greedy over next-state actions)
            update = reward + discount * Qmax - Qcurr

            # backpropagate
            X = np.array(state + actions[a_index_curr])
            X = X.reshape(1, nn_input_dim)
            backward(np.array([[update]]), X)

            # next step
            time_step = time_step + 1
            state = nextState
        print time_step
        game = game + 1

    model = {'W1': W1, 'W2': W2, 'b1': b1, 'b2': b2}
    np.save('model.npy', model)

# Load
read_dictionary = np.load('model.npy').item()
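# The action-selection block above draws an index in proportion to the raw Q
# scores (normalise, build a running cumulative sum, bisect on a uniform draw).
# The helper below is a self-contained sketch of that same technique; the name
# sample_index is hypothetical, and it assumes all scores are positive.
import random
from bisect import bisect

import numpy as np

def sample_index(scores):
    probs = np.asarray(scores, dtype=float)
    probs = probs / probs.sum()       # probability proportional to score
    cdf = np.cumsum(probs)            # cumulative distribution
    draw = random.random()
    return min(bisect(cdf, draw), len(probs) - 1)  # guard against rounding past 1.0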
def train(sess, actor, critic):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in xrange(MAX_EPISODES):
        # s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        s = Utils.INITIAL_STATE
        no_coins_pocketed = 0

        for j in xrange(MAX_EP_STEPS):
            # if RENDER_ENV:
            #     env.render()

            # Added exploration noise
            snn = s["White_Locations"] + s["Black_Locations"] + s["Red_Location"]
            snn = snn + [(0, 0)] * (19 - len(snn))
            snn = np.reshape(np.array(snn), (1, 38))
            # a = actor.predict(snn) + (1. / (1. + i + j))
            a = np.add(
                np.asarray(actor.predict(snn)),
                np.multiply((10. / (100. + 20 * i + j)) * random.random(),
                            np.asarray(actor.action_bound)))

            if a[0][0] > 1:
                a[0][0] = 1
            elif a[0][0] < 0:
                a[0][0] = 0
            if a[0][1] > 225:
                a[0][1] = 225
            elif a[0][1] < -45:
                a[0][1] = -45
            if a[0][2] > 1:
                a[0][2] = 1
            elif a[0][2] < 0:
                a[0][2] = 0

            curr_coins = s["White_Locations"] + s["Black_Locations"] + s["Red_Location"]
            # if j == 0:
            print "\n Episode number : " + str(i) + " Step number:" + str(j) + \
                "\n" + str(a[0]) + "\n" + "Coins left:" + str(len(curr_coins))
            # print a[0]
            s2, r = simulate(s, validate(a[0], s))

            # if color == "White":
            #     opp_color = "Black"
            # else:
            #     opp_color = "White"
            # my_targets = s2[color + "_Locations"]
            # opp_targets = s2[opp_color + "_Locations"]
            my_targets = s2["White_Locations"] + s2["Black_Locations"] + s2["Red_Location"]
            # if j == 0:
            #     print "my targets size = " + str(len(my_targets)) + " Reward = " + str(r)

            if len(my_targets) >= len(curr_coins):
                no_coins_pocketed += 1
            else:
                no_coins_pocketed = 0

            terminal = False
            if no_coins_pocketed > 50 and len(curr_coins) > 2:
                terminal = True
            elif no_coins_pocketed > 100 and len(curr_coins) <= 2:
                terminal = True
            if not my_targets:
                terminal = True

            if terminal:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()
                print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                    '| Qmax: %.4f' % (ep_ave_max_q / float(j))
                break

            s2nn = s2["White_Locations"] + s2["Black_Locations"] + s2["Red_Location"]
            s2nn = s2nn + [(0, 0)] * (19 - len(s2nn))
            s2nn = np.reshape(np.array(s2nn), (1, 38))

            # s2, r, terminal, info = env.step(a[0])
            replay_buffer.add(np.reshape(snn, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2nn, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r
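# The element-wise if/elif clipping of the noisy action above can be written
# more compactly with np.clip. The bounds (position in [0, 1], angle in
# [-45, 225], force in [0, 1]) are taken from the code above; this is an
# equivalent sketch, not a drop-in change to the training loop.
import numpy as np

ACTION_LOW = np.array([0.0, -45.0, 0.0])
ACTION_HIGH = np.array([1.0, 225.0, 1.0])

def clip_action(a):
    # a has shape (1, 3): [position, angle, force]
    return np.clip(a, ACTION_LOW, ACTION_HIGH)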
def agent_2player(state, color):
    flag = 1
    # print "color : ", color
    try:
        # print state
        state, reward = parse_state_message(state)  # Get the state and reward
        # print state, reward, type(state)
    except:
        pass

    # Assignment 4: your agent's logic should be coded here
    if not state:
        # print "\n\n\n\n\n Exiting \n\n\n\n\n"
        return 0

    print "\n\n\n####################\n MY AGENT \n####################\n"

    if color == "White":
        opp_color = "Black"
    else:
        opp_color = "White"

    target_coins = state[color + "_Locations"] + state["Red_Location"]
    queen = state["Red_Location"]
    opp_coins = state[opp_color + "_Locations"]

    n_neighbors_max = -50
    targets = []
    for coin in target_coins:
        temp = num_neighbors(coin, target_coins, opp_coins, queen)
        if temp > n_neighbors_max:
            n_neighbors_max = temp
    for coin in target_coins:
        if num_neighbors(coin, target_coins, opp_coins, queen) == n_neighbors_max:
            targets.append(coin)
    # print "TARGETS", targets

    final_target = random.choice(targets)
    print "FINAL TARGET", final_target

    (x_loc, angle, force) = best_action(final_target, target_coins + opp_coins)
    position = float(x_loc - 170) / float(460)
    # Can be ignored for now
    a = str(position) + ',' + str(angle) + ',' + str(force)
    print "Action taken : " + a + "\n\n"

    start_time = timeit.default_timer()
    (new_state, score_inc) = simulate(state, (x_loc, angle, force))
    elapsed = timeit.default_timer() - start_time
    print "time taken to simulate : " + str(elapsed) + "\n"
    print "\n\n##################################\n"

    try:
        s.send(a)
    except Exception as e:
        print "Error in sending:", a, " : ", e
        print "Closing connection"
        flag = 0
    return flag
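# agent_2player() (and the 1-player agent below) converts between the striker's
# board x-coordinate and the normalised [0, 1] position with the constants 170
# and 460. The pair of helpers below just makes that mapping explicit; the
# function names are hypothetical and the board x-range of roughly [170, 630]
# is inferred from those constants.
def to_unit_position(x_loc):
    return (x_loc - 170.0) / 460.0

def to_board_x(position):
    return 170.0 + 460.0 * position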
# for position in range(30, 73, 3):
#     f.write(str(position/100) + ' ' + str(angle) + ' ' + str(force/100) + '\n')

# Simulate each of the moves above
state = Utils.INITIAL_STATE
print('Finding best action ...')
with open('allFirstMoves', 'r') as f:
    count = 0
    for line in f:
        count += 1
        if (count % 100 == 0):
            print(count, file=sys.stderr)
        [position, angle, force] = [float(x) for x in line[:-1].split(' ')]
        next_state, reward = one_step.simulate(
            state, one_step.validate([position, angle, force], state))
        white_pos = [(x[0], x[1]) for x in next_state["White_Locations"]]
        black_pos = [(x[0], x[1]) for x in next_state["Black_Locations"]]
        red_pos = [(x[0], x[1]) for x in next_state["Red_Location"]]
        print(str(position) + ' ' + str(angle) + ' ' + str(force) + ' ' +
              str(len(white_pos)) + ' ' + str(len(black_pos)) + ' ' +
              str(len(red_pos)), end=' ')
        for x in white_pos:
            print(str(x[0]) + ' ' + str(x[1]), end=' ')
        for x in black_pos:
            print(str(x[0]) + ' ' + str(x[1]), end=' ')
        for x in red_pos:
            print(str(x[0]) + ' ' + str(x[1]), end=' ')
        print()  # end this move's record with a newline
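# The script above writes one record per candidate opening move:
# "position angle force n_white n_black n_red x1 y1 x2 y2 ...". The sketch
# below is a hypothetical post-processing step that picks the move leaving the
# fewest coins on the board; the results file name and the scoring rule are
# assumptions, not part of the original script (it assumes stdout was
# redirected to that file, one record per line).
def best_first_move(results_path='firstMoveResults'):
    best, best_left = None, float('inf')
    with open(results_path) as results:
        for record in results:
            fields = record.split()
            if len(fields) < 6:
                continue
            position, angle, force = [float(x) for x in fields[:3]]
            coins_left = sum(int(x) for x in fields[3:6])
            if coins_left < best_left:
                best, best_left = (position, angle, force), coins_left
    return best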
def agent_1player():
    maxGames = 50
    game = 1
    while game <= maxGames:
        time_step = 1
        coins2 = TWO_PEICE
        state, reward = parse_state_message(coins2, 0)
        maxTime = 500
        # print("Starting game.............", game)
        while time_step <= maxTime:
            # heuristics ...........
            coins = coins2["White_Locations"] + coins2["Black_Locations"] + \
                coins2["Red_Location"]
            queen = coins2["Red_Location"]
            num_coins = len(coins)
            striker_pos, striker_angle = -1, -1
            striker_force = -1
            queen_cut_shot_exists = False

            # If at most 10 coins remain and a cut shot to the queen exists,
            # attempt the cut shot to the queen
            if len(coins) <= 10 and len(queen) == 1:
                striker_pos, striker_angle, striker_force = cut_shot(
                    coins, [], 1, queen[0])
                if striker_pos > 0:
                    queen_cut_shot_exists = True

            # If at most 5 coins remain and no cut shot to the queen exists,
            # aim at the queen directly
            if not queen_cut_shot_exists and len(coins) <= 5 and len(queen) == 1:
                striker_pos, striker_angle, striker_force = cut_shot(
                    coins, [], 0, queen[0])
                queen_cut_shot_exists = True

            # cluster begins here
            if not queen_cut_shot_exists:
                pos, angle, cluster_size, force = select_cluster_1p(coins, 0)
                if (pos < 0):
                    pos, angle, cluster_size, force = select_cluster_1p(coins, 1)
                if (pos >= 0):
                    striker_pos, striker_angle = pos, angle
                    striker_force = force
                    # print "Cluster found close to pocket!"
                else:
                    striker_pos, striker_angle, striker_force = cut_shot(
                        coins, [], 1)
                    if striker_pos < 0:
                        # print "Closest Coin hit"
                        striker_pos, striker_angle, striker_force = cut_shot(
                            coins, [], 0)
                    # else:
                    #     print "cutshot"

            striker_pos = (striker_pos - 170.0) / 460.0
            striker_angle = (striker_angle + 45) / 270.0
            # heuristics end ..........

            # generate an action space around the heuristic shot
            actions = []
            xmin = int((-1) * min(5, (striker_pos / 0.05)))
            xmax = int(min(5, ((1 - striker_pos) / 0.05)))
            anglemin = int((-1) * min(5, striker_angle / 0.02))
            anglemax = int(min(5, ((1 - striker_angle) / 0.02)))
            forcemin = int((-1) * min(5, striker_force / 0.02))
            forcemax = int(min(5, ((1 - striker_force) / 0.02)))
            for x in range(xmin, xmax):
                for angle in range(anglemin, anglemax):
                    for force in range(forcemin, forcemax):
                        actions.append([
                            striker_pos + 0.05 * x,
                            striker_angle + 0.02 * angle,
                            striker_force + 0.02 * force
                        ])
            nActions = len(actions)

            # get a Q value for every action
            Qs = []
            scores = np.zeros(nActions)
            for a in range(nActions):
                X = np.array(state + actions[a])
                Q = forward(X)
                Qs.append(Q)
                scores[a] = Q
            probs = scores / np.sum(scores)
            cdf = [probs[0]]
            for i in range(1, len(probs)):
                cdf.append(cdf[-1] + probs[i])
            a_index_curr = bisect(cdf, random.random())
            Qcurr = Qs[a_index_curr]
            action_picked = actions[a_index_curr]
            angle = -45 + action_picked[1] * 270
            action = [action_picked[0], angle, action_picked[2]]
            # print 'action', action

            # simulate
            coins2, reward = one_step.simulate(
                coins2, one_step.validate(action, coins2))
            if gameEnd(coins2):
                break
            nextState, reward = parse_state_message(coins2, reward)
            # add a time penalty that grows with time_step, capped at -2
            if reward <= 0:
                reward = reward + max(-0.25 * pow(1.1, time_step), -2)

            # greedy action for the next state
            Qmax, a_index_next = -np.inf, -1
            for a in range(nActions):
                X = np.array(nextState + actions[a])
                Qnext = forward(X)
                if Qnext > Qmax:
                    a_index_next = a
                    Qmax = Qnext

            # Q-learning style TD error (greedy over next-state actions)
            update = reward + discount * Qmax - Qcurr

            # backpropagate
            X = np.array(state + actions[a_index_curr])
            X = X.reshape(1, nn_input_dim)
            backward(np.array([[update]]), X)

            # next step
            time_step = time_step + 1
            state = nextState
        print time_step
        game = game + 1

    model = {'W1': W1, 'W2': W2, 'b1': b1, 'b2': b2}
    np.save('model.npy', model)

# Load
read_dictionary = np.load('model.npy').item()
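# model.npy stores the network parameters as a dict ('W1', 'W2', 'b1', 'b2'),
# so restoring them into the globals used by forward()/backward() is a matter
# of unpacking the loaded dictionary; a small sketch under that assumption.
W1, b1 = read_dictionary['W1'], read_dictionary['b1']
W2, b2 = read_dictionary['W2'], read_dictionary['b2']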
def trainNetwork(model, mode):
    # open up a game state to communicate with emulator
    # game_state = game.GameState()

    # store the previous observations in replay memory

    # get the first state by doing nothing and preprocess the image to 80x80x4
    # do_nothing = np.zeros(ACTIONS)
    # do_nothing[0] = 1
    # x_t, r_0, terminal = game_state.frame_step(do_nothing)
    # x_t = skimage.color.rgb2gray(x_t)
    # x_t = skimage.transform.resize(x_t,(80,80))
    # x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))
    # s_t = np.stack((x_t, x_t, x_t, x_t), axis=0)
    # In Keras, need to reshape
    # s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    # state = np.zeros([input_dimension])
    # whites = stateDict["White_Locations"]
    # blacks = stateDict["Black_Locations"]
    # rednecks = stateDict["Red_Location"]
    # for w in whites+blacks+rednecks:
    #     x = w[0]//50
    #     y = w[1]//50
    #     state[16*x+y] += 1
    # # state = state.T
    # s_t = state
    # s_t = s_t.reshape(1, s_t.shape[0])
    # print(state)
    # print("shape", state.shape)

    if mode == 'Run':
        OBSERVE = 999999999    # We keep observing, never train
        epsilon = FINAL_EPSILON
        print("Now we load weight")
        model.load_weights("model.h5")
        adam = Adam(lr=1e-6)
        model.compile(loss='mse', optimizer=adam)
        print("Weight load successfully")
    else:                      # We go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON

    f = open("temp.csv", 'w')
    t = 0
    MAX_EPISODES = 10000
    MAX_EPISODE_STEPS = 500
    file2 = open("ep_end_times.txt", 'w')

    for ep in xrange(0, MAX_EPISODES):
        stateDict = Utils.INITIAL_STATE
        s_t = getStateFromDict(stateDict)
        # state = s_t
        for epstep in xrange(0, MAX_EPISODE_STEPS):
            loss = 0
            Q_sa = 0
            action_index = 0
            r_t = 0
            a_t = np.zeros([ACTIONS])

            # choose an action epsilon-greedily
            if t % FRAME_PER_ACTION == 0:
                if random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(ACTIONS)
                    a_t[action_index] = 1
                else:
                    # print("st0 " + str(s_t[0]))
                    # print("st shape ", s_t.shape)
                    q = model.predict(s_t)  # predict Q values for the current state vector
                    max_Q = np.argmax(q)
                    action_index = max_Q
                    a_t[max_Q] = 1

            # We reduce epsilon gradually
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            # run the selected action and observe the next state and reward
            # x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
            # next_state, reward = play(stateDict, action_index)
            my_action = getActionFromIndex(action_index)
            next_state_dict, reward = simulate(stateDict,
                                               validate(my_action, stateDict))
            next_state = getStateFromDict(next_state_dict)
            w = len(next_state_dict["White_Locations"])
            b = len(next_state_dict["Black_Locations"])
            r = len(next_state_dict["Red_Location"])
            print("COINS : ", w, b, r, w + b + r)

            # x_t1 = skimage.color.rgb2gray(x_t1_colored)
            # x_t1 = skimage.transform.resize(x_t1,(80,80))
            # x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
            # x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
            # s_t1 = np.append(x_t1, s_t[:, :3, :, :], axis=1)

            # store the transition in replay memory D
            D.append((s_t, action_index, reward, next_state))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

            # only train if done observing
            if t > OBSERVE:
                # if t > 40:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                # inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))  # 32, 80, 80, 4
                inputs = np.zeros((BATCH, s_t.shape[1]))
                targets = np.zeros((BATCH, ACTIONS))

                # Now we do the experience replay
                for i in range(0, len(minibatch)):
                    state_t = minibatch[i][0]
                    action_t = minibatch[i][1]  # this is the action index
                    reward_t = minibatch[i][2]
                    state_t1 = minibatch[i][3]
                    # terminal = minibatch[i][4]
                    # if terminated, the target only equals the reward
                    # inputs[i] = state_t  # I saved down s_t
                    inputs[i:i + 1] = state_t  # I saved down s_t
                    targets[i] = model.predict(state_t)  # Q value for each action
                    Q_sa = model.predict(state_t1)
                    terminal = False
                    if state_t1.sum() < 1:
                        terminal = True
                    if terminal:
                        print("*************************************Terminal*************************************")
                        targets[i, action_t] = reward_t
                    else:
                        targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

                # targets2 = normalize(targets)
                loss += model.train_on_batch(inputs, targets)

            stateDict = next_state_dict
            s_t = next_state
            t = t + 1

            # save progress every 100 iterations
            if t % 100 == 0:
                print("Now we save model")
                model.save_weights("model.h5", overwrite=True)
                with open("model.json", "w") as outfile:
                    json.dump(model.to_json(), outfile)

            # print info
            state_ = ""
            if t <= OBSERVE:
                state_ = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state_ = "explore"
            else:
                state_ = "train"

            print("EPISODE", ep, "/ EP_STEP ", epstep, "TIMESTEP", t,
                  "/ STATE_", state_, "/ EPSILON", epsilon,
                  "/ ACTION", action_index, "/ My Action", my_action,
                  "/ REWARD", reward, "/ Q_MAX ", np.max(Q_sa), "/ Loss ", loss)
            comma = str(',')
            f.write(str(t) + comma + str(reward) + comma + str(np.max(Q_sa)) + '\n')

            if (w + b + r) == 0:
                break

        print("Episode finished!")
        print("************************")
        file2.write(str(ep) + str(',') + str(epstep) + '\n')

    file2.close()
    f.close()
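# trainNetwork() relies on getStateFromDict(), which is not included in this
# section. The commented-out block at the top of the function suggests a coarse
# occupancy grid: each coin's (x, y) is binned into 50-pixel cells and counted.
# The sketch below follows that comment, assuming a 16x16 grid (so
# input_dimension = 256); neither value is confirmed by the code shown here.
import numpy as np

def getStateFromDict(stateDict, input_dimension=256):
    state = np.zeros([input_dimension])
    coins = (stateDict["White_Locations"] + stateDict["Black_Locations"] +
             stateDict["Red_Location"])
    for w in coins:
        x = int(w[0]) // 50
        y = int(w[1]) // 50
        state[16 * x + y] += 1
    # Keras expects a batch dimension, hence the (1, input_dimension) shape.
    return state.reshape(1, input_dimension)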