Example #1
    def __init__(self):
        self.game = carmunk.GameState()
        self.episodes_length = 10000
        self.nn = NN(7, 3)
        self.gamma = 0.9

        # Generate the necessary tensorflow ops
        self.inputs1, self.nextQ = self.nn.placeholder_inputs(None)
        self.Qout = self.nn.inference(self.inputs1, 128, 32)
        self.loss = self.nn.loss_val(self.Qout, self.nextQ)
        self.train_op = self.nn.training(self.loss, learning_rate=0.01)

        self.time_per_epoch = tf.placeholder(tf.float32, shape=())
        self.init = tf.initialize_all_variables()
        self.saver = tf.train.Saver()

        # Generate the requisite buffer
        self.experience_memory = 10000
        self.replay = []

        self.minibatch_size = 128
        self.epsilon = 0.9

        # self.saver.restore(self.sess, "newmodel1.ckpt")
        self.logs_path = '/tmp/tensorflow_logs/example21'

        # Create a summary to monitor the time taken per epoch
        tf.scalar_summary("timeperepoch", self.time_per_epoch)

        # Merge all summaries into a single op
        self.merged_summary_op = tf.merge_all_summaries()
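Example #1 only builds the graph; it never shows a training step. A minimal sketch of how these ops might be driven, assuming a tf.Session named sess and hypothetical states/targets batches (the train_step helper is not part of the original code):

def train_step(agent, sess, states, targets):
    # One gradient step on the Q-network built in __init__ above.
    # states: batch of 7-feature inputs; targets: matching target Q-values.
    _, loss = sess.run(
        [agent.train_op, agent.loss],
        feed_dict={agent.inputs1: states, agent.nextQ: targets})
    return loss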
Example #2
def play(model, weights):

    car_distance = 0
    game_state = carmunk.GameState(weights)

    _, state, __ = game_state.frame_step((2))

    featureExpectations = np.zeros(len(weights))

    # Move.
    #time.sleep(15)
    while True:
        time.sleep(0.01)
        car_distance += 1

        # Choose action.
        action = (np.argmax(model.predict(state, batch_size=1)))
        #print ("Action ", action)

        # Take action.
        immediateReward, state, readings = game_state.frame_step(action)
        #print ("immeditate reward:: ", immediateReward)
        #print ("readings :: ", readings)
        #start recording feature expectations only after 100 frames
        if car_distance > 100:
            featureExpectations += (GAMMA**(car_distance -
                                            101)) * np.array(readings)
        #print ("Feature Expectations :: ", featureExpectations)
        # Tell us something.
        if car_distance % 2000 == 0:
            print("Current distance: %d frames." % car_distance)
            break

    return featureExpectations
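The accumulation loop above computes discounted feature expectations, i.e. mu = sum_t GAMMA**t * phi(s_t) over the sensor readings phi(s_t) gathered after the 100-frame warm-up. An equivalent standalone helper (illustrative only, not part of the original code):

import numpy as np

def discounted_feature_expectations(readings_seq, gamma):
    # readings_seq: list of per-frame sensor readings, oldest first.
    mu = np.zeros(len(readings_seq[0]))
    for t, readings in enumerate(readings_seq):
        mu += (gamma ** t) * np.array(readings)
    return mu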
Example #3
def play(model):

    car_distance = 0
    game_state = carmunk.GameState()
    four_states = [[None] * 3 for _ in range(4)]  # four separate placeholder frames (avoids aliasing one inner list)

    # Do nothing to get initial.
    _, start_state = game_state.frame_step((2))

    rotate_state(four_states, start_state)

    # Move.
    while True:
        car_distance += 1
        flat_four = np.array([flatten_state(four_states)])

        # Choose action.
        action = (np.argmax(model.predict(flat_four, batch_size=1)))

        # Take action.
        _, state = game_state.frame_step(action)

        rotate_state(four_states, state)
        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
Example #4
def play(model):

    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    _, state = game_state.frame_step((1))

    # Move.
    while True:
        car_distance += 1

        # Choose action.
        #action = (np.argmax(model.predict(state, batch_size=1)))

        # Take action.

        if random.random() < 0.099:
            action = np.random.randint(3)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        _, state = game_state.frame_step(action)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
Example #5
def play(screen):
    sess = tf.InteractiveSession()
    saved_model = 'saved-models_brown/evaluatedPolicies/1-164-150-100-50000-100000.h5'
    model = Policy_Network(NUM_STATES, [164, 150], sess, saved_model)

    car_distance = 0
    weights = [
        -0.26275824, 0.03635492, 0.09312051, 0.00469211, -0.18295909,
        0.6987476, -0.59225824, -0.2201157
    ]  #brown
    # weights = [-0.06099233, -0.20316265, -0.1427778,  -0.16924885,  0.25280695, -0.0025343, 0.30678838, -0.86483369]
    # weights = [1, 1, 1, 1, 1, 1, 1, 1]# just some random weights, does not matter in calculation of the feature expectations
    game_state = carmunk.GameState(weights, [0, 0, 1, 0])
    _, state, __ = game_state.frame_step((2))
    featureExpectations = np.zeros(len(weights))
    Prev = np.zeros(len(weights))
    replay = []
    while True:
        car_distance += 1
        event = screen.getch()

        if event == curses.KEY_LEFT:
            action = 1
        elif event == curses.KEY_RIGHT:
            action = 0
        elif event == curses.KEY_DOWN:
            break
        else:
            action = 2

        # Take action.
        #start recording feature expectations only after 100 frames
        immediateReward, new_state, readings = game_state.frame_step(action)
        replay.append((state, action, immediateReward, new_state))
        state = new_state

        if car_distance > 100:
            featureExpectations += (GAMMA**(car_distance -
                                            101)) * np.array(readings)

        # Tell us something.
        changePercentage = (np.linalg.norm(featureExpectations - Prev) *
                            100.0) / np.linalg.norm(featureExpectations)

        print(car_distance)
        print("percentage change in Feature expectation ::", changePercentage)
        Prev = np.array(featureExpectations)

        if car_distance % 300 == 0:
            break

    Xtrain, Ytrain = process_minibatch(replay, model)
    np.save('xtrain_brown.npy', Xtrain)
    np.save('ytrain_brown.npy', Ytrain)
    return featureExpectations
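Note that the percentage-change computation above divides by np.linalg.norm(featureExpectations), which stays zero for the first 100 frames and therefore yields nan. A guarded variant of that calculation (illustrative, not from the original code):

import numpy as np

def percentage_change(current, previous):
    norm = np.linalg.norm(current)
    if norm == 0.0:
        return 0.0  # nothing accumulated yet
    return 100.0 * np.linalg.norm(current - previous) / norm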
Example #6
def play(screen):
    car_distance = 0
    weights = [
        1, 1, 1, 1, 1, 1, 1, 1
    ]  # just some arbitrary weights; they do not matter for the calculation of the feature expectations
    game_state = carmunk.GameState(weights)
    _, state, __ = game_state.frame_step((2))
    featureExpectations = np.zeros(len(weights))
    Prev = np.zeros(len(weights))
    while True:
        car_distance += 1
        event = screen.getch()

        if event == curses.KEY_LEFT:
            action = 1
        elif event == curses.KEY_RIGHT:
            action = 0
        elif event == curses.KEY_DOWN:
            break
        else:
            action = 2

        # Take action.
        #start recording feature expectations only after 100 frames
        immediateReward, state, readings = game_state.frame_step(action)
        if car_distance > 100:
            featureExpectations += (GAMMA**(car_distance -
                                            101)) * np.array(readings)

        # Tell us something.
        changePercentage = (np.linalg.norm(featureExpectations - Prev) *
                            100.0) / np.linalg.norm(featureExpectations)

        print(car_distance)
        print("percentage change in Feature expectation ::", changePercentage)
        Prev = np.array(featureExpectations)

        if car_distance % 2000 == 0:
            break

    return featureExpectations
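Examples #5 and #6 expect a curses screen object, so they would typically be launched through curses.wrapper, which sets up and restores the terminal. A usage sketch (not from the original code; screen.nodelay(1) is optional and makes getch() non-blocking so the loop advances without a key press):

import curses

def main(screen):
    screen.nodelay(1)  # getch() returns -1 immediately when no key is pressed
    return play(screen)

if __name__ == '__main__':
    featureExpectations = curses.wrapper(main)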
Example #7
def play(model):

    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    _, state = game_state.frame_step((2))

    # Move.
    while True:
        car_distance += 1

        # Choose action.
        action = (np.argmax(model.predict(state, batch_size=1)))

        # Take action.
        _, state = game_state.frame_step(action)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
Example #8
def play(model):

    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    state, _, speed, _, _, _ = game_state.frame_step(START_ACTION, START_SPEED, START_DISTANCE)

    # Move.
    while True:
        car_distance += 1
        
        # Choose action.
        action = (np.argmax(model.predict(state, batch_size=1)))
        
        # Take action.
        state, _, speed, _, _, _ = game_state.frame_step(action, speed, car_distance)
        
        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
Example #9
def play(model):

    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    reward, state = game_state.frame_step((2))

    # Change this to "while True" to make it never die.
    while reward != -500:
        car_distance += 1

        # Choose action.
        action = (np.argmax(model.predict(state, batch_size=1)))

        # Take action.
        reward, state = game_state.frame_step(action)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)

    print("Made it %d frames." % car_distance)
Example #10
def play(model, weights, sess=None):

    car_distance = 0
    game_state = carmunk.GameState(weights, [1, 0, 0, 0])

    _, state, __ = game_state.frame_step((2))
    # state = state + [1,0,0,0]

    featureExpectations = np.zeros(len(weights))

    # Move.
    #time.sleep(15)
    while True:
        car_distance += 1

        # Choose action.
        action = (np.argmax(model.predict(state)))
        # F = compute_fisher(model, [state], sess)
        # print(F)
        #print ("Action ", action)

        # Take action.
        immediateReward, state, readings = game_state.frame_step(action)
        # state = state + [1,0,0,0]
        #print ("immeditate reward:: ", immediateReward)
        #print ("readings :: ", readings)
        #start recording feature expectations only after 100 frames
        if car_distance > 100:
            featureExpectations += (GAMMA**(car_distance -
                                            101)) * np.array(readings)
        #print ("Feature Expectations :: ", featureExpectations)
        # Tell us something.
        if car_distance % 2000 == 0:
            print("Current distance: %d frames." % car_distance)
            break

    return featureExpectations
Example #11
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 110000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step((1))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(3)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            #print("start")
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)
            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))
        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        # We died, so update stuff.
        if reward <= -500:
            #print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0
        # We reached the goal, so update stuff.
        elif reward >= 2000:
            print("Reached goal.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))
Example #12
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 100000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    min_distance = 10000
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action.
        # Epsilon-greedy exploration is commented out; the greedy action is always taken.
        # if random.random() < epsilon or t < observe:
        #     action = np.random.randint(0, 3)  # random
        # else:
        # Get Q values for each action.
        qval = model.predict(state, batch_size=1)
        action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, distance = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)   # a list of 64 tuples, each tuple holding 4 elements (S, A, R, S')

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(
                X_train, y_train, batch_size=batchSize,
                nb_epoch=1, verbose=0, callbacks=[history]
            )
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0/train_frames)

        if distance < min_distance:
            min_distance = distance

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            # print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d\t%f fps" %
            #       (max_car_distance, t, epsilon, car_distance, distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()
        if t % 10 == 0:
            print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d \tmin_distance %d" %
                 (max_car_distance, t, epsilon, car_distance, distance, min_distance))

        # Save the model every 25,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))
        
    # Log results after we're done with all frames.
    log_results(filename, data_collect, loss_log)
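process_minibatch and process_minibatch2 are also not shown. The usual DQN implementation turns each sampled transition into a Bellman target for the chosen action; a sketch under that assumption (the 0.9 discount, the -500 terminal reward, and the array shapes mirror the snippets above but are not guaranteed to match the original helpers):

import numpy as np

def process_minibatch(minibatch, model, gamma=0.9):
    X_train, y_train = [], []
    for state, action, reward, new_state in minibatch:
        old_q = model.predict(state, batch_size=1)[0]
        max_next_q = np.max(model.predict(new_state, batch_size=1)[0])
        target = np.copy(old_q)
        if reward == -500:                       # terminal (crash): no bootstrap
            target[action] = reward
        else:
            target[action] = reward + gamma * max_next_q
        X_train.append(state[0])
        y_train.append(target)
    return np.array(X_train), np.array(y_train)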
Example #13
def train_net(model):

    observe = 1000  # Number of frames to observe before training.
    epochs = 1000  # Number of games to play.
    epsilon = 1
    batchSize = 40
    # buffer = 50000
    buffer = 5000

    # Just stuff used below.
    max_car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    for i in range(epochs):
        # Create a new game instance.
        game_state = carmunk.GameState()
        status = 1
        # Get initial state by doing nothing and getting the state.
        _, state = game_state.frame_step((2))

        car_distance = 0  # Reset.

        while status == 1:
            t += 1
            car_distance += 1

            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            # Choose an action.
            if random.random() < epsilon or t < observe:
                action = np.random.randint(0, 3)  # random
            else:
                action = (np.argmax(qval))  # best

            # Take action, observe new state and get our treat.
            reward, new_state = game_state.frame_step(action)

            # Experience replay storage.
            replay.append((state, action, reward, new_state))

            # If we're done observing, start training.
            if t > observe:

                # If we've stored enough in our buffer, pop the oldest.
                if len(replay) > buffer:
                    replay.pop(0)

                # Randomly sample our experience replay memory
                minibatch = random.sample(replay, batchSize)

                # Get training values.
                X_train, y_train = process_minibatch(minibatch)

                # Train the model on this batch.
                model.fit(X_train,
                          y_train,
                          batch_size=batchSize,
                          nb_epoch=1,
                          verbose=0)

            # Update the starting state with S'.
            state = new_state

            # We died, so update stuff.
            if reward == -500:
                status = 0
                if car_distance > max_car_distance:
                    max_car_distance = car_distance

                    # Save the model.
                    model.save_weights('saved-models/model-weights-' +
                                       str(car_distance) + '.h5',
                                       overwrite=True)

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / epochs)

        # Log the car's distance at this T.
        data_collect.append([t, car_distance])
        print("Max: %d at %d\tgame %d\tepsilon %f\t(%d)" %
              (max_car_distance, t, i, epsilon, car_distance))

    # Save the results to a file so we can graph it later.
    data_dump = open('results/learn_data-' + str(t) + '.csv', 'w')
    wr = csv.writer(data_dump)
    wr.writerows(data_collect)

    # Save a last version of the model.
    model.save_weights('saved-models/model-weights-' + str(t) + '.h5',
                       overwrite=True)
Example #14
def train_net(model, params):
    global counter
    global lastState
    global last_action
    global lastreward
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 1000000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step((2))
    # state = np.array([14,14,14,14,14,14,14,14,14])
    # state = np.expand_dims(state, axis = 0)
    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(train_new_state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        if lastreward < -100:
            lastState = state
        train_state = np.append(lastState, state[0])
        train_state = np.append(train_state, last_action)
        train_state = np.expand_dims(train_state, axis=0)

        reward, new_state = game_state.frame_step(action)
        train_new_state = np.append(state[0], new_state[0])
        train_new_state = np.append(train_new_state, action)

        train_new_state = np.expand_dims(train_new_state, axis=0)

        if sum(state[0]) >= 42:
            counter += 1
            if counter % 40 == 0:
                replay.append((train_state, action, reward, train_new_state))
                if counter > 1000000000:
                    counter = 0
        else:
            replay.append((train_state, action, reward, train_new_state))

        lastState = np.copy(state)
        state = np.copy(new_state)
        # Experience replay storage.
        last_action = action
        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.batchSize
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            batchSize1 = len(X_train)
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize1,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        # (state and lastState were already updated to S' above.)

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= 5 * (1 / train_frames)

        # We died, so update stuff.
        lastreward = reward
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model every 25,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done with all frames.
    log_results(filename, data_collect, loss_log)
Example #15
    reward_sum = 0
    episode_number = 0

    policy_network = PolicyNetwork(learning_rate)

    # saver
    saver = tf.train.Saver()
    # session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    if resume:
        saver.restore(sess, model_path)

    # create a new game instance
    env = carmunk.GameState()
    done = False

    # get initial state by doing nothing and getting the state
    _, observation = env.frame_step(2)

    while episode_number < max_episode_number:

        current_state = observation
        #print (current_state)

        # forward the policy network and sample an action from the returned probability
        action_prob = policy_network.predict(current_state[np.newaxis, :],
                                             sess)
        action = np.random.choice(a=3, p=action_prob.ravel())
Example #16
def train_net(turn_model, turn_model_30, turn_model_50, turn_model_70,
              avoid_model, acquire_model, acquire_model_30, acquire_model_50,
              acquire_model_70, hunt_model, pack_model, params):

    filename = params_to_filename(params)

    if cur_mode in [TURN, HUNT, PACK]:
        observe = 2000  # Number of frames to observe before training.
    else:
        observe = 2000

    epsilon = 1  # vary this based on pre-learning already occurred in lower models
    train_frames = 750000  # number of flips for training
    batchSize = params['batchSize']
    buffer = params['buffer']

    # initialize variables and structures used below.
    max_crash_frame_ctr = 0
    crash_frame_ctr = 0
    total_frame_ctr = 0
    replay_frame_ctr = 0
    stop_ctr = 0
    avoid_ctr = 0
    acquire_ctr = 0
    cum_rwd = 0
    cum_speed = 0

    data_collect = []
    replay = []  # replay stores state, action, reward, new state
    loss_log = []
    save_init = True
    cur_speeds = []
    for i in range(NUM_DRONES):
        cur_speeds.append(START_SPEED)

    # initialize drone state holders
    turn_states = np.zeros(
        [NUM_DRONES, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES])
    avoid_states = np.zeros(
        [NUM_DRONES, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES])
    acquire_states = np.zeros(
        [NUM_DRONES, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES])
    hunt_states = np.zeros(
        [NUM_DRONES, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES])
    drone_states = np.zeros(
        [NUM_DRONES, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES])

    # create game instance
    game_state = carmunk.GameState()

    # get initial state(s)
    turn_state, avoid_state, acquire_state, hunt_state, drone_state, reward, cur_speed = \
        game_state.frame_step(START_DRONE_ID, START_TURN_ACTION, START_SPEED_ACTION,
                              START_PACK_ACTION, START_SPEED, START_DISTANCE, 1)

    # initialize frame states
    if cur_mode in [TURN, AVOID, HUNT, PACK]:

        for i in range(NUM_DRONES):
            turn_states[i] = state_frames(
                turn_state,
                np.zeros((1, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES)),
                TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

        if cur_mode in [AVOID, HUNT, PACK]:

            for i in range(NUM_DRONES):
                avoid_states[i] = state_frames(
                    avoid_state,
                    np.zeros((1, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES)),
                    AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

    if cur_mode in [ACQUIRE, HUNT, PACK]:

        for i in range(NUM_DRONES):
            acquire_states[i] = state_frames(
                acquire_state,
                np.zeros((1, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES)),
                ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

    if cur_mode in [HUNT, PACK]:

        for i in range(NUM_DRONES):
            hunt_states[i] = state_frames(
                hunt_state,
                np.zeros((1, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES)),
                HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

    if cur_mode == PACK:

        for i in range(NUM_DRONES):
            drone_states[i] = state_frames(
                drone_state,
                np.zeros((1, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES)),
                DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

        pack_state = state_frames(
            drone_state, np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)),
            PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)

    # time it
    start_time = timeit.default_timer()

    # run frames
    while total_frame_ctr < train_frames:

        total_frame_ctr += 1  # counts total training distance traveled
        crash_frame_ctr += 1  # counts distance between crashes
        replay_frame_ctr += 1  # counts frames between pack mode replay captures

        # used to slow things down for de-bugging
        #time.sleep(0.25)

        for drone_id in range(NUM_DRONES):  # NUM_DRONES = 1, unless you're in PACK mode

            speed_action = START_SPEED_ACTION

            # choose appropriate action(s)
            # note: only generates random inputs for currently training model.
            # all prior (sub) models provide their best (fully-trained) inputs
            if random.random() < epsilon or total_frame_ctr < observe:  # epsilon degrades over flips...
                if cur_mode == TURN:
                    turn_action = set_turn_action(
                        True, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id],
                            np.array([turn_states[drone_id]]))

                    if cur_mode == AVOID:
                        speed_action = set_avoid_action(
                            True, turn_action,
                            np.array([avoid_states[drone_id]]))
                    else:
                        if cur_mode in [HUNT, PACK]:
                            speed_action = set_avoid_action(
                                False, turn_action,
                                np.array([avoid_states[drone_id]]))

                        if cur_mode == ACQUIRE:
                            acquire_action = set_acquire_action(
                                True, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))
                            turn_action = acquire_action
                        else:
                            acquire_action, acquire_model = set_acquire_action(
                                False, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))

                            if cur_mode == HUNT:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    True, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))
                            else:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    False, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))

                                if cur_mode == PACK and (
                                        total_frame_ctr == 1 or
                                    (replay_frame_ctr - 1) % PACK_EVAL_FRAMES
                                        == 0) and drone_id == 0:
                                    pack_action = set_pack_action(
                                        True, pack_state)
                                    # note: pack action only changed every PACK_EVAL_FRAMES.
                                    # for frames in between it's constant

            else:  # ...increasing use of predictions over time
                if cur_mode == TURN:
                    turn_action, turn_model = set_turn_action(
                        False, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id],
                            np.array([turn_states[drone_id]]))

                    if cur_mode == AVOID:
                        speed_action = set_avoid_action(
                            False, turn_action,
                            np.array([avoid_states[drone_id]]))
                    else:
                        if cur_mode in [HUNT, PACK]:
                            speed_action = set_avoid_action(
                                False, turn_action,
                                np.array([avoid_states[drone_id]]))

                        if cur_mode == ACQUIRE:
                            acquire_action, acquire_model = set_acquire_action(
                                False, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))
                            turn_action = acquire_action
                        else:
                            acquire_action, acquire_model = set_acquire_action(
                                False, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))

                            if cur_mode == HUNT:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    False, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))
                            else:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    False, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))

                                if cur_mode == PACK and (
                                        total_frame_ctr == 1 or
                                    (replay_frame_ctr - 1) % PACK_EVAL_FRAMES
                                        == 0) and drone_id == 0:
                                    # get 1 pack action for each set of drones on first drone
                                    pack_action = set_pack_action(
                                        False, pack_state)
                                    print(pack_action)

            #print("++++++ pack action:", pack_action)
            #print(2)
            # pass action, receive new state, reward
            new_turn_state, new_avoid_state, new_acquire_state, new_hunt_state, new_drone_state, new_reward, new_speed = game_state.frame_step(
                drone_id, turn_action, speed_action, pack_action,
                cur_speeds[drone_id], total_frame_ctr, replay_frame_ctr)

            #print("********** 2. new states / rewards:")
            #print(total_frame_ctr)
            #print(drone_id)
            #print(new_drone_state)
            #print(new_reward)

            #print(3)
            # append (horizontally) historical states for learning speed.
            """ note: do this concatination even for models that are not learning (e.g., turn when running search or turn, search and acquire while running hunt) b/c their preds, performed above, expect the same multi-frame view that was in place when they trained."""

            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                new_turn_state = state_frames(
                    new_turn_state, np.array([turn_states[drone_id]]),
                    TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

            if cur_mode in [AVOID, HUNT, PACK]:
                new_avoid_state = state_frames(
                    new_avoid_state, np.array([avoid_states[drone_id]]),
                    AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                new_acquire_state = state_frames(
                    new_acquire_state, np.array([acquire_states[drone_id]]),
                    ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

            if cur_mode in [HUNT, PACK]:
                new_hunt_state = state_frames(
                    new_hunt_state, np.array([hunt_states[drone_id]]),
                    HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

            #print(4)
            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                if drone_id == 0:  # for 1st drone, pack state = drone state
                    new_pack_state = new_drone_state
                    pack_rwd = new_reward

                else:  # otherwise, append drone record to prior drone state
                    new_pack_state = state_frames(new_pack_state,
                                                  new_drone_state,
                                                  DRONE_TOTAL_SENSOR, 2)
                    pack_rwd += new_reward

                new_drone_state = state_frames(
                    new_drone_state, np.array([drone_states[drone_id]]),
                    DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

                if drone_id == (NUM_DRONES -
                                1):  # for last drone build pack record
                    if total_frame_ctr == 1:
                        pack_state = np.zeros(
                            (1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES))

                    new_pack_state = state_frames(
                        new_pack_state, pack_state, PACK_TOTAL_SENSORS,
                        PACK_STATE_FRAMES
                    )  #may need to add 1 to PACK_STATE_FRAMES

                    #print("**** 3. final pack reward:")
                    #print(pack_rwd)

            #print(5)
            # experience replay storage
            """note: only the model being trained requires event storage as it is stack that will be sampled for training below."""
            if cur_mode == TURN:
                replay.append((np.array([turn_states[drone_id]]), turn_action,
                               new_reward, new_turn_state))

            elif cur_mode == AVOID:
                replay.append((np.array([avoid_states[drone_id]]),
                               speed_action, new_reward, new_avoid_state))

            elif cur_mode == ACQUIRE:
                replay.append((np.array([acquire_states[drone_id]]),
                               turn_action, new_reward, new_acquire_state))

            elif cur_mode == HUNT:
                replay.append((np.array([hunt_states[drone_id]]), hunt_action,
                               new_reward, new_hunt_state))

            elif cur_mode == PACK and (total_frame_ctr == 1
                                       or replay_frame_ctr % PACK_EVAL_FRAMES
                                       == 0) and drone_id == (NUM_DRONES - 1):
                replay.append(
                    (pack_state, pack_action, pack_rwd, new_pack_state))
                #print(replay[-1])

            #print("6a")
            # If we're done observing, start training.
            if total_frame_ctr > observe and (
                    cur_mode != PACK or
                (replay_frame_ctr % PACK_EVAL_FRAMES == 0
                 and drone_id == (NUM_DRONES - 1))):

                # If we've stored enough in our buffer, pop the oldest.
                if len(replay) > buffer:
                    replay.pop(0)

                # Randomly sample our experience replay memory
                minibatch = random.sample(replay, batchSize)

                if cur_mode == TURN:
                    # Get training values.
                    X_train, y_train = process_minibatch(
                        minibatch, turn_model, TURN_NUM_INPUT, TURN_NUM_OUTPUT)
                    history = LossHistory()
                    turn_model.fit(X_train,
                                   y_train,
                                   batch_size=batchSize,
                                   nb_epoch=1,
                                   verbose=0,
                                   callbacks=[history])

                elif cur_mode == AVOID:
                    X_train, y_train = process_minibatch(
                        minibatch, avoid_model, AVOID_NUM_INPUT,
                        AVOID_NUM_OUTPUT)
                    history = LossHistory()
                    avoid_model.fit(X_train,
                                    y_train,
                                    batch_size=batchSize,
                                    nb_epoch=1,
                                    verbose=0,
                                    callbacks=[history])

                elif cur_mode == ACQUIRE:
                    X_train, y_train = process_minibatch(
                        minibatch, acquire_model, ACQUIRE_NUM_INPUT,
                        ACQUIRE_NUM_OUTPUT)
                    history = LossHistory()
                    acquire_model.fit(X_train,
                                      y_train,
                                      batch_size=batchSize,
                                      nb_epoch=1,
                                      verbose=0,
                                      callbacks=[history])

                elif cur_mode == HUNT:
                    X_train, y_train = process_minibatch(
                        minibatch, hunt_model, HUNT_NUM_INPUT, HUNT_NUM_OUTPUT)
                    history = LossHistory()
                    hunt_model.fit(X_train,
                                   y_train,
                                   batch_size=batchSize,
                                   nb_epoch=1,
                                   verbose=0,
                                   callbacks=[history])

                elif cur_mode == PACK:
                    X_train, y_train = process_minibatch(
                        minibatch, pack_model, PACK_NUM_INPUT, PACK_NUM_OUTPUT)
                    history = LossHistory()
                    pack_model.fit(X_train,
                                   y_train,
                                   batch_size=batchSize,
                                   nb_epoch=1,
                                   verbose=0,
                                   callbacks=[history])

                loss_log.append(history.losses)

            # Update the starting state with S'.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                turn_states[drone_id] = new_turn_state

            if cur_mode in [AVOID, HUNT, PACK]:
                avoid_states[drone_id] = new_avoid_state

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                acquire_states[drone_id] = new_acquire_state

            if cur_mode in [HUNT, PACK]:
                hunt_states[drone_id] = new_hunt_state

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                drone_states[drone_id] = new_drone_state

                if drone_id == (NUM_DRONES - 1):
                    pack_state = new_pack_state
                    replay_frame_ctr = 0

            cur_speeds[drone_id] = new_speed
            cum_rwd += new_reward

            # in case of crash, report and initialize
            if new_reward == -500 or new_reward == -1000:
                # Log the car's distance at this T.
                data_collect.append([total_frame_ctr, crash_frame_ctr])

                # Update max.
                if crash_frame_ctr > max_crash_frame_ctr:
                    max_crash_frame_ctr = crash_frame_ctr

                # Time it.
                tot_time = timeit.default_timer() - start_time
                fps = crash_frame_ctr / tot_time

                # Output some stuff so we can watch.
                #try:
                print(
                    "Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d\t fps: %d"
                    % (max_crash_frame_ctr, total_frame_ctr, epsilon,
                       crash_frame_ctr, cur_mode, cum_rwd, int(fps)))
                #    break
                #except (RuntimeError, TypeError, NameError):
                #    pass

                # Reset.
                crash_frame_ctr = cum_rwd = cum_speed = 0
                start_time = timeit.default_timer()

        #print(9)
        # decrement epsilon for another frame
        if epsilon > 0.1 and total_frame_ctr > observe:
            epsilon -= (1 / train_frames)

        if total_frame_ctr % 10000 == 0:
            if crash_frame_ctr != 0:
                #try:
                print(
                    "Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d"
                    % (max_crash_frame_ctr, total_frame_ctr, epsilon,
                       crash_frame_ctr, cur_mode, cum_rwd))
                #    break
                #except (RuntimeError, TypeError, NameError):
                #pass

        # Save model every 50k frames
        if total_frame_ctr % 50000 == 0:
            save_init = False
            if cur_mode == TURN:
                turn_model.save_weights('models/turn/turn-' + filename + '-' +
                                        str(START_SPEED) + '-' +
                                        str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving turn_model %s - %d - %d" %
                      (filename, START_SPEED, total_frame_ctr))

            elif cur_mode == AVOID:
                avoid_model.save_weights('models/avoid/avoid-' + filename +
                                         '-' + str(total_frame_ctr) + '.h5',
                                         overwrite=True)
                print("Saving avoid_model %s - %d" %
                      (filename, total_frame_ctr))

            elif cur_mode == ACQUIRE:
                acquire_model.save_weights('models/acquire/acquire-' +
                                           filename + '-' + str(START_SPEED) +
                                           '-' + str(total_frame_ctr) + '.h5',
                                           overwrite=True)
                print("Saving acquire_model %s - %d" %
                      (filename, total_frame_ctr))

            elif cur_mode == HUNT:
                hunt_model.save_weights('models/hunt/hunt-' + filename + '-' +
                                        str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving hunt_model %s - %d" %
                      (filename, total_frame_ctr))

            elif cur_mode == PACK:
                pack_model.save_weights('models/pack/pack-' + filename + '-' +
                                        str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving pack_model %s - %d" %
                      (filename, total_frame_ctr))

    # Log results after we're done with all frames.
    log_results(filename, data_collect, loss_log)
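log_results is not defined in these examples either. Based on the CSV dump in Example #13, it plausibly writes the crash-distance log and the per-batch losses to disk; a sketch under that assumption (paths and file naming are guesses):

import csv

def log_results(filename, data_collect, loss_log):
    # Distance survived between crashes, indexed by frame count.
    with open('results/learn_data-' + filename + '.csv', 'w') as f:
        csv.writer(f).writerows(data_collect)
    # One row of per-batch losses for every training step.
    with open('results/loss_data-' + filename + '.csv', 'w') as f:
        writer = csv.writer(f)
        for losses in loss_log:
            writer.writerow(losses)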
Example #17
def train_net(model, params):

    filename = params_to_filename(params)
    observe = 129  # Number of frames to observe before training.
    epsilon = 0.5
    train_frames = 50000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S'). #to be displayed

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.

    while t < train_frames:
        print(t)
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            print("PREDICTED", state)
            # time.sleep(1)
            x = state[0]
            y = state[1]
            qval = model.predict(np.array([x, y]).reshape((1, 2)),
                                 batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, term = game_state.frame_step(action)
        print("timestep :" + str(t) + "Reward" + str(reward) + "action" +
              str(action) + "state" + str(state))
        # Experience replay storage.
        replay.append((state, action, reward, new_state))
        # print(len(replay))
        # If we're done observing, start training.
        if t > observe:
            #print("start")
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)
            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)
            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))
        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (10.0 / train_frames)
            print("EPSILON UPDATED", epsilon)

        # We died, so update stuff.
        if term == 1:
            # print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset, then skip to the next frame.
            car_distance = 0
            continue
        # We reached the goal, so update stuff.
        elif term == 2:
            print("Reached goal.", car_distance)
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset, then skip to the next frame.
            car_distance = 0
            continue

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))
        # if(keyboard.is_pressed('8')):
        #     print("Reset Goal")
        #     game_state.reset_goal()
        print(t, reward, action)
Example #18
def train_net(best_action_model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 500000  # Number of frames to play. was 1000000
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    cum_rwd = 0
    cum_rwd_read = 0
    cum_rwd_dist = 0
    cum_rwd_speed = 0

    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    save_init = True
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    state, new_reward, cur_speed, _, _, _ = game_state.frame_step(
        START_ACTION, START_SPEED, START_DISTANCE)

    # frame_step returns reward, state, speed
    #state = state_frames(state, np.array([[0, 0, 0, 0, 0, 0, 0]])) # zeroing distance readings
    #state = state_frames(state, np.zeros((1,NUM_SENSORS))) # zeroing distance readings

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        #time.sleep(0.5)

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, NUM_OUTPUT)  # random
        else:
            # Get Q values for each action
            qval = best_action_model.predict(state, batch_size=1)
            # best_action_model was passed to this function. call it w/ current state
            action = (np.argmax(qval))  # best prediction

        # Take action, observe new state and get our treat.
        new_state, new_reward, new_speed, new_rwd_read, new_rwd_dist, new_rwd_speed = \
            game_state.frame_step(action, cur_speed, car_distance)

        # Use multiple frames.
        #new_state = state_frames(new_state, state) # seems this is appending 2-3 moves, results

        # Experience replay storage.
        replay.append((state, action, new_reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            # WHY RANDOM SAMPLE? COULD TRAINING BE SPED UP BY TAKING LAST BATCHSIZE

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, best_action_model)

            # Train the best_action_model on this batch.
            history = LossHistory()
            best_action_model.fit(X_train,
                                  y_train,
                                  batch_size=batchSize,
                                  nb_epoch=1,
                                  verbose=0,
                                  callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state
        cur_speed = new_speed
        cum_rwd += new_reward
        cum_rwd_read += new_rwd_read
        cum_rwd_dist += new_rwd_dist
        cum_rwd_speed += new_rwd_speed

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if new_reward == -500 or new_reward == -1000:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\t eps: %f\t dist: %d\t rwd: %d\t read: %d\t dist: %d\t speed: %d\t fps: %d" %
                  (max_car_distance, t, epsilon, car_distance, cum_rwd, \
                   cum_rwd_read, cum_rwd_dist, cum_rwd_speed, int(fps)))

            # Reset.
            car_distance = 0
            cum_rwd = 0
            cum_rwd_read = 0
            cum_rwd_dist = 0
            cum_rwd_speed = 0
            start_time = timeit.default_timer()

        # Save early best_action_model, then every 20,000 frames
        if t % 50000 == 0:
            save_init = False
            best_action_model.save_weights('saved-best_action_models/' +
                                           filename + '-' + str(t) + '.h5',
                                           overwrite=True)
            print("Saving best_action_model %s - %d" % (filename, t))

    # Log results after we're done with all frames.
    log_results(filename, data_collect, loss_log)
Example #19
0
def train_net(model, params, weights, path, trainFrames, i=10):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = trainFrames  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState(weights, [1, 0, 0, 0])

    # Get initial state by doing nothing and getting the state.
    _, state, temp1 = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 3)  # random choice among the 3 actions
        else:
            # Get Q values for each action.
            qval = model.predict(state)
            action = (np.argmax(qval))  # best
            #print ("action under learner ", action)

        # Take action, observe new state and get our treat.
        reward, new_state, temp2 = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            # history = LossHistory()
            actions, steploss = model.train(X_train, y_train)

            # loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if state[0][7] == 1:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            #print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
            #(max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model once all training frames have been played
        # (t only reaches a multiple of train_frames at the very end).
        # print(t)
        if t % train_frames == 0:
            model.save_weights('saved-models_' + path + '/evaluatedPolicies/' +
                               str(i) + '-' + filename + '-' + str(t) + '.h5')
            print("Saving model %s - %d" % (filename, t))
Example #20
0
def play(turn_model, turn_model_30, turn_model_50, turn_model_70, avoid_model,
         acquire_model, acquire_model_30, acquire_model_50, acquire_model_70,
         hunt_model, pack_model, params):

    total_frame_ctr = 0
    crash_frame_ctr = 0
    replay_frame_ctr = 0
    crash_ctr = 0
    acquire_ctr = 0
    cum_speed = 0
    stop_ctr = avoid_ctr = 0  # acquire_ctr is already initialized above
    cur_speeds = []
    for i in range(NUM_DRONES):
        cur_speeds.append(START_SPEED)

    # initialize drone state holders
    turn_states = np.zeros(
        [NUM_DRONES, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES])
    avoid_states = np.zeros(
        [NUM_DRONES, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES])
    acquire_states = np.zeros(
        [NUM_DRONES, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES])
    hunt_states = np.zeros(
        [NUM_DRONES, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES])
    drone_states = np.zeros(
        [NUM_DRONES, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES])

    # create game instance
    game_state = carmunk.GameState()

    # get initial state(s)
    turn_state, avoid_state, acquire_state, hunt_state, drone_state, reward, cur_speed = \
        game_state.frame_step(START_DRONE_ID, START_TURN_ACTION, START_SPEED_ACTION,
                              START_PACK_ACTION, START_SPEED, START_DISTANCE, 1)

    # initialize frame states
    if cur_mode in [TURN, AVOID, HUNT, PACK]:

        for i in range(NUM_DRONES):
            turn_states[i] = state_frames(
                turn_state,
                np.zeros((1, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES)),
                TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

        if cur_mode in [AVOID, HUNT, PACK]:

            for i in range(NUM_DRONES):
                avoid_states[i] = state_frames(
                    avoid_state,
                    np.zeros((1, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES)),
                    AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

    if cur_mode in [ACQUIRE, HUNT, PACK]:

        for i in range(NUM_DRONES):
            acquire_states[i] = state_frames(
                acquire_state,
                np.zeros((1, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES)),
                ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

    if cur_mode in [HUNT, PACK]:

        for i in range(NUM_DRONES):
            hunt_states[i] = state_frames(
                hunt_state,
                np.zeros((1, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES)),
                HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

    if cur_mode == PACK:

        for i in range(NUM_DRONES):
            drone_states[i] = state_frames(
                drone_state,
                np.zeros((1, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES)),
                DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

        #pack_state = state_frames(drone_state,
        #                          np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)),
        #                          PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)

        pack_state = state_frames(drone_state, np.zeros((1, 30)), 10, 4)

    # Move.
    while True:

        total_frame_ctr += 1
        crash_frame_ctr += 1
        replay_frame_ctr += 1

        #time.sleep(1)

        for drone_id in range(
                NUM_DRONES):  # NUM_DRONES = 1, unless you're in PACK mode

            speed_action = START_SPEED_ACTION

            # choose action
            if cur_mode == TURN:
                turn_action, turn_model = set_turn_action(
                    False, cur_speeds[drone_id],
                    np.array([turn_states[drone_id]]))
            else:
                if cur_mode in [AVOID, HUNT, PACK]:
                    turn_action, turn_model = set_turn_action(
                        False, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))

                if cur_mode == AVOID:
                    speed_action = set_avoid_action(
                        False, turn_action, np.array([avoid_states[drone_id]]))
                else:
                    if cur_mode in [HUNT, PACK]:
                        speed_action = set_avoid_action(
                            False, turn_action,
                            np.array([avoid_states[drone_id]]))

                    if cur_mode == ACQUIRE:
                        acquire_action, acquire_model = set_acquire_action(
                            False, cur_speeds[drone_id],
                            np.array([acquire_states[drone_id, ]]))
                        turn_action = acquire_action
                    else:
                        acquire_action, acquire_model = set_acquire_action(
                            False, cur_speeds[drone_id],
                            np.array([acquire_states[drone_id, ]]))

                        if cur_mode == HUNT:
                            hunt_action, turn_action, speed_action = set_hunt_action(
                                False, cur_speeds[drone_id], turn_action,
                                speed_action, acquire_action,
                                np.array([hunt_states[drone_id, ]]))
                        else:
                            hunt_action, turn_action, speed_action = set_hunt_action(
                                False, cur_speeds[drone_id], turn_action,
                                speed_action, acquire_action,
                                np.array([hunt_states[drone_id, ]]))

                            if cur_mode == PACK and (
                                    total_frame_ctr == 1 or replay_frame_ctr %
                                    PACK_EVAL_FRAMES == 0) and drone_id == 0:
                                # get 1 pack action for each set of drones on first drone
                                pack_action = set_pack_action(
                                    False, pack_state)

            # pass action, receive new state, reward
            new_turn_state, new_avoid_state, new_acquire_state, new_hunt_state, new_drone_state, new_reward, new_speed = game_state.frame_step(
                drone_id, turn_action, speed_action, pack_action,
                cur_speeds[drone_id], total_frame_ctr, replay_frame_ctr)

            # append (horizontally) historical states for learning speed.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                new_turn_state = state_frames(
                    new_turn_state, np.array([turn_states[drone_id]]),
                    TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

            if cur_mode in [AVOID, HUNT, PACK]:
                new_avoid_state = state_frames(
                    new_avoid_state, np.array([avoid_states[drone_id]]),
                    AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                new_acquire_state = state_frames(
                    new_acquire_state, np.array([acquire_states[drone_id]]),
                    ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

            if cur_mode in [HUNT, PACK]:
                new_hunt_state = state_frames(
                    new_hunt_state, np.array([hunt_states[drone_id]]),
                    HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                if drone_id == 0:  # for 1st drone, pack state = drone state
                    new_pack_state = new_drone_state
                    pack_rwd = new_reward

                else:  # otherwise, append drone record to prior drone state
                    new_pack_state = state_frames(new_pack_state,
                                                  new_drone_state,
                                                  DRONE_TOTAL_SENSOR,
                                                  PACK_STATE_FRAMES - 1)
                    pack_rwd += new_reward

                new_drone_state = state_frames(
                    new_drone_state, np.array([drone_states[drone_id]]),
                    DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

                if drone_id == (NUM_DRONES -
                                1):  # for last drone build pack record
                    if total_frame_ctr == 1:
                        pack_state = np.zeros(
                            (1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES))

                    new_pack_state = state_frames(
                        new_pack_state, pack_state, PACK_TOTAL_SENSORS,
                        PACK_STATE_FRAMES
                    )  #may need to add 1 to PACK_STATE_FRAMES

            # Update the starting state with S'.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                turn_states[drone_id] = new_turn_state

            if cur_mode in [AVOID, HUNT, PACK]:
                avoid_states[drone_id] = new_avoid_state

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                acquire_states[drone_id] = new_acquire_state

            if cur_mode in [HUNT, PACK]:
                hunt_states[drone_id] = new_hunt_state

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                drone_states[drone_id] = new_drone_state

                if drone_id == (NUM_DRONES - 1):
                    pack_state = new_pack_state
                    replay_frame_ctr = 0

            cur_speeds[drone_id] = new_speed

        # give status
        if new_reward == -500 or new_reward == -1000:
            crash_ctr += 1
            print("crashes", crash_ctr, "frames", total_frame_ctr)
        elif new_reward == 1000:
            acquire_ctr += 1
            print("acquisitions:", acquire_ctr, "frames", total_frame_ctr)

        if total_frame_ctr % 5000 == 0:
            print("***** total frames:", total_frame_ctr)

            # guard against division by zero before the first crash/acquisition
            if crash_ctr > 0:
                print("***** frames between crashes:",
                      int(total_frame_ctr / crash_ctr))

            if cur_mode in [ACQUIRE, HUNT, PACK] and acquire_ctr > 0:
                print("***** frames / acquisition:",
                      int(total_frame_ctr / acquire_ctr))
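
# state_frames is used throughout this example to maintain a rolling window of
# sensor readings, but its implementation is not shown. The sketch below is an
# assumption inferred from the call sites: the newest reading goes in front of
# the existing history and the stack is truncated so the model always sees
# num_frames frames of num_sensors values. The function name is illustrative.
import numpy as np

def state_frames_sketch(new_state, old_state, num_sensors, num_frames):
    combined = np.append(new_state, old_state)       # newest values first
    combined = combined[: num_sensors * num_frames]  # drop the oldest frame(s)
    return combined.reshape(1, num_sensors * num_frames)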
Example #21
0
def trainNetwork(model, args):
    filename = 'rl'
    data_collect = []
    loss_log = []
    car_distance = 0
    r_t_sum = 0

    # open up a game state to communicate with emulator
    game_state = carmunk.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    #print (s_t.shape)

    #In Keras, need to reshape
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  #1*80*80*4

    if args['mode'] == 'Run':
        OBSERVE = 999999999  # keep observing forever; never train
        epsilon = FINAL_EPSILON
        print("Now we load the weights")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        print("Weights loaded successfully")
    else:  # training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON

    begin = strftime("%a, %d %b %Y %H:%M:%S", gmtime())
    t = 0
    while t < TOTAL_FRAMES:
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])
        car_distance += 1

        #choose an action epsilon greedy
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                #print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                q = model.predict(
                    s_t)  #input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index = max_Q
                a_t[max_Q] = 1

        # Reduce epsilon gradually once past the observation phase
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        #run the selected action and observed next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)

        r_t_sum += r_t

        x_t1 = skimage.color.rgb2gray(x_t1_colored)
        x_t1 = skimage.transform.resize(x_t1, (80, 80), mode='reflect')
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  #1x80x80x1
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        if terminal:
            # Episode ended: log the frame count, distance travelled and cumulative reward.
            data_collect.append([t, car_distance, r_t_sum])
            car_distance = 0
            r_t_sum = 0

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        #only train if done observing
        if t > OBSERVE:
            #sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2],
                               s_t.shape[3]))  #32, 80, 80, 4
            #print (inputs.shape)
            targets = np.zeros((inputs.shape[0], ACTIONS))  #32, 2

            #Now we do the experience replay
            for i in range(0, len(minibatch)):
                state_t = minibatch[i][0]
                action_t = minibatch[i][1]  #This is action index
                reward_t = minibatch[i][2]
                state_t1 = minibatch[i][3]
                terminal = minibatch[i][4]
                # if the episode terminated, the target is just the reward

                inputs[i:i + 1] = state_t  # the stored s_t

                targets[i] = model.predict(
                    state_t)  # predicted Q value for each action
                Q_sa = model.predict(state_t1)

                if terminal:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            # targets2 = normalize(targets)
            loss += model.train_on_batch(inputs, targets)
            #loss_log.append(model.train_on_batch(inputs, targets))

        s_t = s_t1
        t = t + 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            #print("Now we save model")
            model.save_weights("model.h5", overwrite=True)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)

            # Log results after we're done all frames.
            log_results(filename, data_collect, loss_log)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "Observe / Running"
        elif t <= OBSERVE + EXPLORE:
            state = "Training"
        else:
            state = "Execution"

        if t % 10000 == 0:
            print("TIMESTEP", t, "/ STATE", state, \
                "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
                "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss, "/ Reward_sum ", r_t_sum, "/ car_distance ", car_distance)

    print("Episode finished!")
    print("************************")
    end = strftime("%a, %d %b %Y %H:%M:%S", gmtime())

    print("=============== Total training time =====================")
    print(begin)
    print(end)
    print("=========================================================")

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
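
# The per-frame image preprocessing in the loop above (grayscale, 80x80 resize,
# intensity rescale, stack with the three most recent frames) can be read as a
# single helper. This sketch simply restates those steps from the code above;
# the function name is illustrative and not part of the original file.
import numpy as np
import skimage.color
import skimage.exposure
import skimage.transform

def preprocess_and_stack_sketch(frame, prev_stack):
    x = skimage.color.rgb2gray(frame)
    x = skimage.transform.resize(x, (80, 80), mode='reflect')
    x = skimage.exposure.rescale_intensity(x, out_range=(0, 255))
    x = x.reshape(1, 80, 80, 1)
    # network input: the newest frame plus the three most recent previous frames
    return np.append(x, prev_stack[:, :, :, :3], axis=3)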