Example #1
def step(self, action):
    a = play(self.S)
    a = map(float, a.split(","))
    self.S, r = simulate(self.S, [a[0], a[1], action * 0.01])
    # print "Reward", r
    if (len(self.S["White_Locations"]) == 0
            and len(self.S["Black_Locations"]) == 0
            and len(self.S["Red_Location"]) == 0):
        return vectorize_state(self.S), r + 20, True
    return vectorize_state(self.S), r, False
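Note that map returns a list in Python 2, so the indexing a[0], a[1] above works there, but under Python 3 map is a lazy iterator and that line would fail. A small version-agnostic sketch of the same parsing (the sample string is illustrative; real values come from play()):

raw = "0.42,135.0"  # illustrative "position,angle" string of the kind play() might return
parts = [float(x) for x in raw.split(",")]  # a real list under both Python 2 and Python 3
print("position=%.2f angle=%.1f" % (parts[0], parts[1]))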
Example #2
def step(self, action):
    self.S, r = simulate(self.S, [(action[0] + 1) / 2,
                                  (action[1] + 1) * 135 - 45,
                                  (action[2] + 1) / 2])
    print "Reward", r
    if (len(self.S["White_Locations"]) == 0
            and len(self.S["Black_Locations"]) == 0
            and len(self.S["Red_Location"]) == 0):
        return vectorize_state(self.S), r + 20, True
    return vectorize_state(self.S), r, False
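The arithmetic passed to simulate maps a tanh-squashed action in [-1, 1]^3 onto the simulator's ranges: position in [0, 1], angle in [-45, 225] degrees, force in [0, 1]. A self-contained sketch of that mapping (the scale_action helper is illustrative, not part of the snippet above):

def scale_action(a):
    # position in [0, 1], angle in [-45, 225] degrees, force in [0, 1]
    return [(a[0] + 1) / 2.0, (a[1] + 1) * 135.0 - 45.0, (a[2] + 1) / 2.0]

print(scale_action([-1.0, 0.0, 1.0]))  # -> [0.0, 90.0, 1.0]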
Example #3
def agent_1player():
    maxGames = 50
    game = 1
    nActions = len(actions)

    while game <= maxGames:
        time_step = 0
        coins = ONE_PEICE
        state, reward = parse_state_message(coins, 0)

        maxTime = 500
        # print("Starting game.............", game)
        while time_step < maxTime:

            # get Q value for every action
            Qs = []
            scores = np.zeros(nActions)
            for a in range(nActions):
                X = np.array(state + actions[a])
                Q = forward(X)
                Qs.append(Q)
                scores[a] = Q

            # exp_scores = np.exp(scores)
            # # get probability distribution
            # print "exp scores sum", np.sum(exp_scores)
            probs = scores / np.sum(scores)
            cdf = [probs[0]]
            for i in range(1, len(probs)):
                cdf.append(cdf[-1] + probs[i])

            a_index_curr = bisect(cdf, random.random())
            # print 'prob sum', np.sum(probs)
            # for p in probs:
            #     print p
            # pick action
            # action_indices = np.arange(nActions)
            # a_index_curr = stats.rv_discrete(values=(action_indices, probs)).rvs()
            Qcurr = Qs[a_index_curr]
            action_picked = actions[a_index_curr]
            angle = -45 + action_picked[1] * 270
            action = [action_picked[0], angle, action_picked[2]]
            # print 'action', action

            # simulate
            coins, reward = one_step.simulate(coins,
                                              one_step.validate(action, coins))
            if gameEnd(coins):
                break
            nextState, reward = parse_state_message(coins, reward)
            # print(nextState, reward)

            # greedy for action of next state
            Qmax, a_index_next = -np.inf, -1
            for a in range(nActions):
                X = np.array(nextState + actions[a])
                Qnext = forward(X)
                if Qnext > Qmax:
                    a_index_next = a
                    Qmax = Qnext

            # TD update with a greedy next-state target (Q-learning style)
            update = reward + discount * Qmax - Qcurr

            # backpropagate
            X = np.array(state + actions[a_index_curr])
            X = X.reshape(1, nn_input_dim)
            backward(np.array([[update]]), X)

            # next step
            time_step = time_step + 1
            state = nextState

        print time_step
        game = game + 1

    model = {'W1': W1, 'W2': W2, 'b1': b1, 'b2': b2}
    np.save('model.npy', model)
    # Load
    read_dictionary = np.load('model.npy').item()
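A caveat on the save/load round trip at the end: NumPy 1.16.3 and later refuse to unpickle object arrays by default, so np.load('model.npy').item() raises a ValueError there and needs allow_pickle=True. A minimal sketch with dummy weights:

import numpy as np

model = {'W1': np.zeros((2, 2)), 'b1': np.zeros(2)}  # dummy weights for illustration
np.save('model.npy', model)
restored = np.load('model.npy', allow_pickle=True).item()  # allow_pickle needed on NumPy >= 1.16.3
print(sorted(restored.keys()))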
Example #4
def train(sess, actor, critic):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    # tf.initialize_all_variables and tf.train.SummaryWriter are the pre-TF-1.0 names
    # (tf.global_variables_initializer and tf.summary.FileWriter in later releases)
    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in xrange(MAX_EPISODES):

        # s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        s = Utils.INITIAL_STATE
        no_coins_pocketed = 0

        for j in xrange(MAX_EP_STEPS):

            # if RENDER_ENV:
            #     env.render()

            # Added exploration noise
            snn = s["White_Locations"] + s["Black_Locations"] + s[
                "Red_Location"]
            snn = snn + [(0, 0)] * (19 - len(snn))
            snn = np.reshape(np.array(snn), (1, 38))
            # a = actor.predict(snn) + (1. / (1. + i + j))
            a = np.add(
                np.asarray(actor.predict(snn)),
                np.multiply((10. / (100. + 20 * i + j)) * random.random(),
                            np.asarray(actor.action_bound)))

            if a[0][0] > 1:
                a[0][0] = 1
            elif a[0][0] < 0:
                a[0][0] = 0

            if a[0][1] > 225:
                a[0][1] = 225
            elif a[0][1] < -45:
                a[0][1] = -45

            if a[0][2] > 1:
                a[0][2] = 1
            elif a[0][2] < 0:
                a[0][2] = 0

            curr_coins = s["White_Locations"] + s["Black_Locations"] + s[
                "Red_Location"]
            # if j==0:
            print "\n Episode number : " + str(i) + "   Step number:" + str(
                j) + "\n" + str(a[0]) + "\n" + "Coins left:" + str(
                    len(curr_coins))
            # print a[0]

            s2, r = simulate(s, validate(a[0], s))

            # if color == "White" :
            #     opp_color = "Black"
            # else:
            #     opp_color = "White"

            # my_targets = s2[color+"_Locations"]
            # opp_targets = s2[opp_color + "_Locations"]

            my_targets = s2["White_Locations"] + s2["Black_Locations"] + s2[
                "Red_Location"]
            # if j==0:
            #  print "my targets size = " + str(len(my_targets)) + " Reward = " + str(r)

            if len(my_targets) >= len(curr_coins):
                no_coins_pocketed += 1
            else:
                no_coins_pocketed = 0

            terminal = False

            if no_coins_pocketed > 50 and len(curr_coins) > 2:
                terminal = True
            elif no_coins_pocketed > 100 and len(curr_coins) <= 2:
                terminal = True

            if not my_targets:
                terminal = True

            if terminal:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                 '| Qmax: %.4f' % (ep_ave_max_q / float(j))
                break

            s2nn = s2["White_Locations"] + s2["Black_Locations"] + s2[
                "Red_Location"]
            s2nn = s2nn + [(0, 0)] * (19 - len(s2nn))
            s2nn = np.reshape(np.array(s2nn), (1, 38))

            # s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(snn, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
             terminal, np.reshape(s2nn, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                 replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r
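The three if/elif blocks that clamp the noisy action to position [0, 1], angle [-45, 225] degrees, and force [0, 1] could be written with a single np.clip. A small sketch under those same bounds (the sample action values are made up):

import numpy as np

low = np.array([0.0, -45.0, 0.0])   # position, angle (degrees), force lower bounds
high = np.array([1.0, 225.0, 1.0])  # matching upper bounds

a = np.array([[1.3, 240.0, -0.2]])  # a noisy actor output, shape (1, 3)
a = np.clip(a, low, high)           # same effect as the if/elif chains above
print(a)                            # [[  1. 225.   0.]]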
Example #5
def agent_2player(state, color):

    flag = 1

    # print "color : ", color
    try:
        # print state
        state, reward = parse_state_message(state)  # Get the state and reward
        # print state, reward, type(state)
    except:
        pass

    # Assignment 4: your agent's logic should be coded here

    if not state:
        # print "\n\n\n\n\n Exiting \n\n\n\n\n"
        return 0

    print "\n\n\n####################\n MY AGENT \n####################\n"

    if color == "White":
        opp_color = "Black"
    else:
        opp_color = "White"

    target_coins = state[color + "_Locations"] + state["Red_Location"]
    queen = state["Red_Location"]
    opp_coins = state[opp_color + "_Locations"]

    n_neighbors_max = -50
    targets = []
    for coin in target_coins:
        temp = num_neighbors(coin, target_coins, opp_coins, queen)
        if temp > n_neighbors_max:
            n_neighbors_max = temp

    for coin in target_coins:
        if num_neighbors(coin, target_coins, opp_coins,
                         queen) == n_neighbors_max:
            targets.append(coin)

    # print "TARGETS", targets
    final_target = random.choice(targets)
    print "FINAL TARGET", final_target
    (x_loc, angle, force) = best_action(final_target, target_coins + opp_coins)

    position = float(x_loc - 170) / float(460)

    # Can be ignored for now
    a = str(position) + ',' + \
        str(angle) + ',' + str(force)

    print "Action taken : " + a + "\n\n"

    start_time = timeit.default_timer()
    (new_state, score_inc) = simulate(state, (x_loc, angle, force))
    elapsed = timeit.default_timer() - start_time
    print "time taken to simulate : " + str(elapsed) + "\n"

    print "\n\n##################################\n"

    try:
        s.send(a)
    except Exception as e:
        print "Error in sending:", a, " : ", e
        print "Closing connection"
        flag = 0

    return flag
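The two loops over target_coins call num_neighbors twice per coin (once to find the best neighbour count, once to collect the ties). A one-pass sketch of the same selection; pick_target and the score callable are illustrative stand-ins, not functions from the snippet above:

import random

def pick_target(coins, score):
    # score each coin once, keep the ties for the best score, pick one at random
    scored = [(score(c), c) for c in coins]
    best = max(s for s, _ in scored)
    return random.choice([c for s, c in scored if s == best])

print(pick_target([(100, 200), (300, 400), (500, 600)], score=lambda c: c[0] % 3))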
Example #6
# 			for position in range(30, 73, 3):
# 				f.write(str(position/100) + ' ' + str(angle) + ' ' + str(force/100) + '\n')

# Simulate each of the moves above
state = Utils.INITIAL_STATE
print('Finding best action ...')
with open('allFirstMoves', 'r') as f:
    count = 0
    for line in f:
        count += 1
        if (count % 100 == 0):
            print(count, file=sys.stderr)

        [position, angle, force] = [float(x) for x in line[:-1].split(' ')]

        next_state, reward = one_step.simulate(
            state, one_step.validate([position, angle, force], state))

        white_pos = [(x[0], x[1]) for x in next_state["White_Locations"]]
        black_pos = [(x[0], x[1]) for x in next_state["Black_Locations"]]
        red_pos = [(x[0], x[1]) for x in next_state["Red_Location"]]

        print(str(position) + ' ' + str(angle) + ' ' + str(force) + ' ' +
              str(len(white_pos)) + ' ' + str(len(black_pos)) + ' ' +
              str(len(red_pos)),
              end=' ')
        for x in white_pos:
            print(str(x[0]) + ' ' + str(x[1]), end=' ')
        for x in black_pos:
            print(str(x[0]) + ' ' + str(x[1]), end=' ')
        for x in red_pos:
            print(str(x[0]) + ' ' + str(x[1]), end=' ')
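A sketch of how the allFirstMoves file read above might be generated, following the commented-out loop at the top of this example. The position grid (30 to 72 in steps of 3, scaled by 1/100) comes from that comment; the angle and force grids below are placeholders rather than the original values, and 100.0 is used so the division also works under Python 2:

with open('allFirstMoves', 'w') as f:
    for angle in range(-45, 226, 15):        # placeholder angle grid (degrees)
        for force in range(10, 101, 10):     # placeholder force grid (scaled by 1/100)
            for position in range(30, 73, 3):
                f.write(str(position / 100.0) + ' ' + str(angle) + ' ' +
                        str(force / 100.0) + '\n')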
Example #7
def agent_1player():
    maxGames = 50
    game = 1

    while game <= maxGames:
        time_step = 1
        coins2 = TWO_PEICE
        state, reward = parse_state_message(coins2, 0)

        maxTime = 500
        # print("Starting game.............", game)
        while time_step <= maxTime:
            # heuristics ...........
            coins = coins2["White_Locations"] + coins2[
                "Black_Locations"] + coins2["Red_Location"]
            queen = coins2["Red_Location"]
            num_coins = len(coins)
            striker_pos, striker_angle = -1, -1
            striker_force = -1
            queen_cut_shot_exists = False

            # if at most 10 coins remain and a cut shot to the queen exists, attempt it
            if len(coins) <= 10 and len(queen) == 1:
                striker_pos, striker_angle, striker_force = cut_shot(
                    coins, [], 1, queen[0])
                if striker_pos > 0:
                    queen_cut_shot_exists = True

            # if at most 5 coins remain and no cut shot to the queen exists, aim at the queen directly
            if not queen_cut_shot_exists and len(coins) <= 5 and len(
                    queen) == 1:
                striker_pos, striker_angle, striker_force = cut_shot(
                    coins, [], 0, queen[0])
                queen_cut_shot_exists = True

            # cluster begins here
            if not queen_cut_shot_exists:
                pos, angle, cluster_size, force = select_cluster_1p(coins, 0)
                if (pos < 0):
                    pos, angle, cluster_size, force = select_cluster_1p(
                        coins, 1)
                if (pos >= 0):
                    striker_pos, striker_angle = pos, angle
                    striker_force = force
                    # print "Cluster found close to pocket!"
                else:
                    striker_pos, striker_angle, striker_force = cut_shot(
                        coins, [], 1)
                    if striker_pos < 0:
                        # print "Closest Coin hit"
                        striker_pos, striker_angle, striker_force = cut_shot(
                            coins, [], 0)
                    # else:
                    #     print "cutshot"

            striker_pos = (striker_pos - 170.0) / 460.0
            striker_angle = (striker_angle + 45) / 270.0

            # heuristics end ..........

            #  generate action space
            actions = []
            xmin = int((-1) * min(5, (striker_pos / 0.05)))
            xmax = int(min(5, ((1 - striker_pos) / 0.05)))
            anglemin = int((-1) * min(5, striker_angle / 0.02))
            anglemax = int(min(5, ((1 - striker_angle) / 0.02)))
            forcemin = int((-1) * min(5, striker_force / 0.02))
            forcemax = int(min(5, ((1 - striker_force) / 0.02)))
            for x in range(xmin, xmax):
                for angle in range(anglemin, anglemax):
                    for force in range(forcemin, forcemax):
                        actions.append([
                            striker_pos + 0.05 * x,
                            striker_angle + 0.02 * angle,
                            striker_force + 0.02 * force
                        ])
            nActions = len(actions)

            # get Q value for every action
            Qs = []
            scores = np.zeros(nActions)
            for a in range(nActions):
                X = np.array(state + actions[a])
                Q = forward(X)
                Qs.append(Q)
                scores[a] = Q

            probs = scores / np.sum(scores)
            cdf = [probs[0]]
            for i in range(1, len(probs)):
                cdf.append(cdf[-1] + probs[i])

            a_index_curr = bisect(cdf, random.random())
            Qcurr = Qs[a_index_curr]
            action_picked = actions[a_index_curr]
            angle = -45 + action_picked[1] * 270
            action = [action_picked[0], angle, action_picked[2]]
            # print 'action', action

            # simulate
            coins2, reward = one_step.simulate(
                coins2, one_step.validate(action, coins2))
            if gameEnd(coins2):
                break
            nextState, reward = parse_state_message(coins2, reward)

            if reward <= 0:
                reward = reward + max(-0.25 * pow(1.1, time_step), -2)

            # greedy for action of next state
            Qmax, a_index_next = -np.inf, -1
            for a in range(nActions):
                X = np.array(nextState + actions[a])
                Qnext = forward(X)
                if Qnext > Qmax:
                    a_index_next = a
                    Qmax = Qnext

            # TD update with a greedy next-state target (Q-learning style)
            update = reward + discount * Qmax - Qcurr

            # backpropagate
            X = np.array(state + actions[a_index_curr])
            X = X.reshape(1, nn_input_dim)
            backward(np.array([[update]]), X)

            # next step
            time_step = time_step + 1
            state = nextState

        print time_step
        game = game + 1

    model = {'W1': W1, 'W2': W2, 'b1': b1, 'b2': b2}
    np.save('model.npy', model)
    # Load
    read_dictionary = np.load('model.npy').item()
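The shaping term max(-0.25 * pow(1.1, time_step), -2) applied to non-positive rewards is a per-step penalty that grows exponentially with the step count and saturates at -2. A tiny sketch of that schedule:

def step_penalty(t):
    # exponentially growing step penalty, capped at -2
    return max(-0.25 * (1.1 ** t), -2)

for t in (1, 10, 22, 50):
    print("t=%d penalty=%.3f" % (t, step_penalty(t)))
# t=1 -> -0.275, t=10 -> -0.648, t=22 and beyond -> -2.000 (cap)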
Example #8
def trainNetwork(model, mode):
    # open up a game state to communicate with emulator
    # game_state = game.GameState()

    # store the previous observations in replay memory

    # get the first state by doing nothing and preprocess the image to 80x80x4
    # do_nothing = np.zeros(ACTIONS)
    # do_nothing[0] = 1
    # x_t, r_0, terminal = game_state.frame_step(do_nothing)

    # x_t = skimage.color.rgb2gray(x_t)
    # x_t = skimage.transform.resize(x_t,(80,80))
    # x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))

    # s_t = np.stack((x_t, x_t, x_t, x_t), axis=0)

    #In Keras, need to reshape
    # s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    # state = np.zeros([input_dimension])
    # whites = stateDict["White_Locations"]
    # blacks = stateDict["Black_Locations"]
    # rednecks = stateDict["Red_Location"]
    # for w in whites+blacks+rednecks:
    # 	x = w[0]//50
    # 	y = w[1]//50
    # 	state[16*x+y] +=1
    # # state= state.T
    # s_t = state
    # s_t = s_t.reshape(1,s_t.shape[0])

    # print(state)
    # print ("shape", state.shape)
    if mode == 'Run':
        OBSERVE = 999999999  # keep observing only, never train
        epsilon = FINAL_EPSILON
        print("Now we load the weights")
        model.load_weights("model.h5")
        adam = Adam(lr=1e-6)
        model.compile(loss='mse', optimizer=adam)
        print("Weights loaded successfully")
    else:  #We go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON
    f = open("temp.csv", 'w')

    t = 0
    MAX_EPISODES = 10000
    MAX_EPISODE_STEPS = 500
    file2 = open("ep_end_times.txt", 'w')
    for ep in xrange(0, MAX_EPISODES):
        stateDict = Utils.INITIAL_STATE
        s_t = getStateFromDict(stateDict)
        # state = s_t

        for epstep in xrange(0, MAX_EPISODE_STEPS):
            loss = 0
            Q_sa = 0
            action_index = 0
            r_t = 0
            a_t = np.zeros([ACTIONS])
            # choose an action epsilon-greedily
            if t % FRAME_PER_ACTION == 0:
                if random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(ACTIONS)
                    a_t[action_index] = 1
                else:
                    # print ("st0 " + str(s_t[0]) )
                    # print ("st shape ",s_t.shape)
                    q = model.predict(s_t)  # Q-value estimates for the current state vector
                    max_Q = np.argmax(q)
                    action_index = max_Q
                    a_t[max_Q] = 1

            # reduce epsilon gradually once past the observation phase
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            # run the selected action and observe the next state and reward
            # x_t1_colored, r_t, terminal = game_state.frame_step(a_t)

            # TODO : implement this
            # next_state, reward = play(stateDict, action_index)
            my_action = getActionFromIndex(action_index)
            next_state_dict, reward = simulate(stateDict,
                                               validate(my_action, stateDict))
            next_state = getStateFromDict(next_state_dict)
            w = len(next_state_dict["White_Locations"])
            b = len(next_state_dict["Black_Locations"])
            r = len(next_state_dict["Red_Location"])
            print("COINS : ", w, b, r, w + b + r)
            # x_t1 = skimage.color.rgb2gray(x_t1_colored)
            # x_t1 = skimage.transform.resize(x_t1,(80,80))
            # x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

            # x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
            # s_t1 = np.append(x_t1, s_t[:, :3, :, :], axis=1)

            # store the transition in D
            D.append((s_t, action_index, reward, next_state))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

            #only train if done observing
            if t > OBSERVE:
                # if t > 40:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                # inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))   #32, 80, 80, 4
                inputs = np.zeros((BATCH, s_t.shape[1]))  # BATCH x state_dim
                targets = np.zeros((BATCH, ACTIONS))  # BATCH x ACTIONS

                #Now we do the experience replay
                for i in range(0, len(minibatch)):
                    state_t = minibatch[i][0]
                    action_t = minibatch[i][1]  #This is action index
                    reward_t = minibatch[i][2]
                    state_t1 = minibatch[i][3]
                    # terminal = minibatch[i][4]
                    # if terminated, only equals reward

                    # inputs[i] = state_t
                    inputs[i:i + 1] = state_t  # store the sampled state

                    targets[i] = model.predict(state_t)  # current Q-value estimates per action
                    Q_sa = model.predict(state_t1)  # Q-values of the next state

                    terminal = False
                    if state_t1.sum() < 1: terminal = True
                    if terminal:
                        print(
                            "*************************************Terminal*************************************"
                        )
                        targets[i, action_t] = reward_t
                    else:
                        targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

                # targets2 = normalize(targets)
                loss += model.train_on_batch(inputs, targets)
            stateDict = next_state_dict
            s_t = next_state
            t = t + 1

            # save progress every 100 iterations
            if t % 100 == 0:
                print("Now we save model")
                model.save_weights("model.h5", overwrite=True)
                with open("model.json", "w") as outfile:
                    json.dump(model.to_json(), outfile)

            # print info
            state_ = ""
            if t <= OBSERVE:
                state_ = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state_ = "explore"
            else:
                state_ = "train"

            print("EPISODE", ep, "/ EP_STEP ", epstep, "TIMESTEP", t , "/ STATE_", state_, \
             "/ EPSILON", epsilon, "/ ACTION", action_index, "/ My Action",my_action, "/ REWARD", reward, \
             "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)
            comma = str(',')
            f.write(
                str(t) + comma + str(reward) + comma + str(np.max(Q_sa)) +
                '\n')
            if (w + b + r) == 0:
                break

        print("Episode finished!")
        print("************************")
        file2.write(str(ep) + str(',') + str(epstep) + '\n')
    file2.close()
    f.close()
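The replay memory D in trainNetwork is trimmed by hand with popleft once it grows past REPLAY_MEMORY; a collections.deque with maxlen gives the same bounded FIFO behaviour automatically. A short sketch (the capacity value here is illustrative; the real constant lives elsewhere in the project):

from collections import deque

REPLAY_MEMORY = 50000            # illustrative capacity
D = deque(maxlen=REPLAY_MEMORY)  # oldest transitions are dropped automatically
for i in range(REPLAY_MEMORY + 10):
    D.append(i)
print(len(D))                    # 50000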