Example #1
def train_models(X_train, y_train, batchSize, model, loss_log):
    history = LossHistory()
    model.fit(X_train,
              y_train,
              batch_size=batchSize,
              nb_epoch=1,
              verbose=0,
              callbacks=[history])
    loss_log.append(history.losses)

    return loss_log
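
The LossHistory object used throughout these examples is never defined in the snippets. A minimal sketch of what such a callback typically looks like, assuming the standard Keras Callback API (this is an assumption, not the authors' exact class):

# Sketch of a per-batch loss recorder (hypothetical; the examples only rely on a
# `losses` list being filled during fit()).
from keras.callbacks import Callback

class LossHistory(Callback):
    def on_train_begin(self, logs=None):
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # Keras reports the current batch loss under the 'loss' key.
        self.losses.append(logs.get('loss'))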
Example #2
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 100000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    min_distance = 10000
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step(2)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action. The epsilon-greedy branch below is commented out,
        # so this example always acts greedily on the predicted Q values.
        # if random.random() < epsilon or t < observe:
        #     action = np.random.randint(0, 3)  # random
        # else:
        #     # Get Q values for each action.
        qval = model.predict(state, batch_size=1)
        action = np.argmax(qval)  # best

        # Take action, observe new state and get our treat.
        reward, new_state, distance = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)   # a list of 64 tuples, each tuple with 4 elements (S, A, R, S')

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(
                X_train, y_train, batch_size=batchSize,
                nb_epoch=1, verbose=0, callbacks=[history]
            )
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0/train_frames)

        if distance < min_distance:
            min_distance = distance

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            # print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d\t%f fps" %
            #       (max_car_distance, t, epsilon, car_distance, distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()
        if t % 10 == 0:
            print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d \tmin_distance %d" %
                 (max_car_distance, t, epsilon, car_distance, distance, min_distance))

        # Save the model every 10,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))
        
    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
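
Note that Example #2 keeps the epsilon-greedy branch commented out and always takes the greedy action. The other examples choose actions roughly like the following sketch; the function and the num_actions parameter are illustrative, not taken from the original code:

import random
import numpy as np

def choose_action(model, state, epsilon, t, observe, num_actions=3):
    # Explore with probability epsilon (and always during the observation phase).
    if random.random() < epsilon or t < observe:
        return np.random.randint(0, num_actions)
    # Otherwise exploit: pick the action with the highest predicted Q value.
    qval = model.predict(state, batch_size=1)
    return int(np.argmax(qval))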
Example #3
  reward = np.dot(train,weightReadings)
  reward = reward.astype(int)
  
  if trainCount > observe:
    # If we've stored enough in our buffer, pop the oldest.
    if len(replay) > buffer:
      replay.pop(0)
    
    # Randomly sample our experience replay memory
    minibatch = random.sample(replay, batchSize)
    
    # Get training values by Sarsa 0
    X_train, y_train = tv.sarsa0_minibatch(minibatch, model, sarsa0P)

    # Train the model on this batch.
    history = LossHistory()
    model.fit(
      X_train, y_train, batch_size=batchSize,
      nb_epoch=1, verbose=0, callbacks=[history]
    )
    loss_log.append(history.losses)

	
  state = new_state
  
  if epsilon > final_epsilon and trainCount > observe:
    epsilon -= (1/train_frames)
  print (epsilon)
  
  # Save the model every 25,000 frames.
  filename = 'train5'
Example #4
    def update_replay(self, reward, new_state, action=None):
        if action is None:
            action = self.lastAction

        # Experience replay storage.
        self.replay.append(
            (np.copy(self.old_state), action, reward, np.copy(new_state)))

        # If we're done observing, start training.
        if self.t > self.observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(self.replay) > self.buffer:
                self.replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(self.replay, self.batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, self.model,
                                                  self.sequence_length,
                                                  self.end_value, self.GAMMA)

            # Train the model on this batch.
            history = LossHistory()
            self.model.fit(X_train,
                           y_train,
                           batch_size=self.batchSize,
                           nb_epoch=1,
                           verbose=0,
                           callbacks=[history])
            self.loss_log.append(history.losses)

            if self.t % self.save_every == 0:
                if len(self.data_collect) > 50:
                    # Save the results to a file so we can graph it later.
                    learn_f = 'results/command-frames/learn_data-' + self.filename + '.csv'
                    with open(learn_f, 'w', newline='') as data_dump:
                        wr = csv.writer(data_dump)
                        wr.writerows(self.data_collect)
                    plotting.plot_file(learn_f, 'learn')

                if len(self.loss_log) > 500:
                    loss_f = 'results/command-frames/loss_data-' + self.filename + '.csv'
                    with open(loss_f, 'w', newline='') as lf:
                        wr = csv.writer(lf)
                        for loss_item in self.loss_log:
                            wr.writerow(loss_item)

                    plotting.plot_file(loss_f, 'loss')

        # Update the starting state with S'.
        self.state = new_state

        # Decrement epsilon over time.
        if self.epsilon > 0.1 and self.t > self.observe:
            self.epsilon -= (1.0 / self.train_frames)

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            print([self.t, self.hacker_cmds])
            self.data_collect.append([self.t, self.hacker_cmds])

            # Update max.
            if self.hacker_cmds > self.max_hacker_cmds:
                self.max_hacker_cmds = self.hacker_cmds

            # Time it.
            tot_time = timeit.default_timer() - self.start_time
            fps = self.hacker_cmds / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (self.max_hacker_cmds, self.t, self.epsilon,
                   self.hacker_cmds, fps))

            # Reset.
            self.hacker_cmds = 0
            self.start_time = timeit.default_timer()

        # Save the model and replay buffer every save_every frames.
        if self.t % self.save_every == 0:
            pickle.dump(
                self.replay,
                open(self.save_replay_file_prefix + "-" + str(self.t), "wb"))
            model_save_filename = self.save_model_file_prefix + self.filename + '-' + str(
                self.t) + '.h5'
            self.model.save_weights(model_save_filename, overwrite=True)
            print("Saving model %s - %d" % (self.filename, self.t))
def train(model, params):
    filename = params_to_filename(params)

    EPISODE = 10
    FRAMES = 4000
    OBSERVE = FRAMES * 3
    epsilon = 1
    batchSize = params['batchSize']
    buffer = params['buffer']
    replay = []
    minibatch = []
    total_frames = 0
    path_log = []
    loss_log = []

    # min_path_length = 0

    for m in range(EPISODE):
        print("Episode: %d" % (m))
        gameObject = GameClass(draw_screen=True, display_path=True, fps=FPS)

        # Choose no action in the initial frame
        action = 2
        reward, state = gameObject.frame_step(action)
        for t in range(FRAMES):
            total_frames += 1

            if t % (FRAMES / 10) == 0:
                print("Frames: %d" % (t))

            # Choose the action based on the epsilon greedy algorithm
            if (random.random() < epsilon
                    or total_frames < OBSERVE):  # choose random action
                action = np.random.randint(0, 3)
            else:  # choose best action from Q(s,a) values
                # Let's run our Q function on (state,action) to get Q values for all possible actions
                Q = np.zeros(3)
                for a in range(3):
                    features = get_features(state, a)
                    Q[a] = model.predict(features, batch_size=batchSize)
                action = np.argmax(Q)

            # Execute the action, observe new state and reward
            reward, state_new = gameObject.frame_step(action)
            path_length = gameObject.num_steps

            # Store the (state, action, reward, new state) pair in the replay
            memory = state, action, reward, state_new
            replay.append(memory)

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory if we have enough samples
            if total_frames > OBSERVE:
                minibatch = random.sample(replay, batchSize)

                # Process the minibatch to get the training data
                X_train, y_train = process_minibatch(minibatch, model,
                                                     batchSize)

                # Train the model on this batch.
                history = LossHistory()
                model.fit(X_train,
                          y_train,
                          batch_size=batchSize,
                          verbose=0,
                          callbacks=[history])
                loss_log.append(history.losses)

                # Decrement epsilon over time.
                if epsilon > 0.1:
                    epsilon -= 1.0 / (FRAMES * EPISODE - OBSERVE)

            # Update the starting state with S'.
            state = state_new

            # Stop this episode if we achieved the goal
            if gameObject.check_reach_goal():
                # Log the robot's path length
                path_log.append([m, path_length])

                # # Update the min
                # if path_length < min_path_length:
                #     min_path_length = path_length

                # # Output some stuff so we can watch.
                # print("Min: %d \t epsilon %f\t(%d)" %
                #   (min_path_length, epsilon, path_length))

                # Stop this episode
                break

        # Save the model every episode after observation.
        if total_frames > OBSERVE:
            model.save('saved-models/model_nn-' + filename + '-' + str(m) +
                       '.h5',
                       overwrite=True)
            print("Saving model %s - %d" % (filename, m))

    # Log results after we're done all episodes.
    log_results(filename, path_log, loss_log, m)
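
The epsilon schedule in the example above decays by 1.0 / (FRAMES * EPISODE - OBSERVE) per training frame and is floored at 0.1. A small closed-form sketch of that schedule, under those assumptions (the function name and parameters are illustrative):

def annealed_epsilon(frames_trained, total_training_frames, start=1.0, floor=0.1):
    # frames_trained: frames elapsed since the observation phase ended.
    # total_training_frames: FRAMES * EPISODE - OBSERVE in the example's terms.
    return max(floor, start - frames_trained / float(total_training_frames))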
Example #6
def train_net(model, params, mode='grid'):

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = TRAIN_FRAMES  # Number of frames to play.

    filename = params_to_filename(params, mode, train_frames)
    print(filename)

    if mode == 'lane_following':
        rate = 10  # Hz
        screen = pygame.display.set_mode((1300, 600))
        pygame.display.set_caption("mdeyo car sim")
        background = pygame.Surface(screen.get_size())
        background.fill((0, 0, 0))
        RED = (255, 0, 0)
        car = Car2(RED, 60, 385, screen, 100)
        road = CurvedRoad(1200, 60, 385, '45')
        car.constant_speed = True
        state = road.getState(car)
        print('state:', state)

    if mode == 'grid':
        # Create a new game instance.
        # game_state = carmunk.GameState()
        grid = Grid(X_DIM, Y_DIM)
        car = Car(grid, 0, 0)
        game_state = World(grid, car, 500, 10, False)
        # Get initial state by doing nothing and getting the state.
        _, state = game_state.updateState(0)

    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_reward = -999999
    car_reward = 0
    done = 0  # only set in lane_following mode; initialized so grid mode's check below doesn't raise NameError
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S')
    loss_log = []

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1

        if mode == 'grid':
            # Choose an action.
            if random.random() < epsilon or t < observe:
                action = np.random.randint(0, 3)  # random
            else:
                # Get Q values for each action.
                qval = model.predict(state, batch_size=1)
                action = (np.argmax(qval))  # best

            # Take action, observe new state and get our treat.
            #reward, new_state = game_state.frame_step(action)
            car_reward, new_state = game_state.updateState(action)
            # car_reward = reward
            # print(reward)

        elif mode == 'lane_following':
            # Choose an action.
            if random.random() < epsilon or t < observe:
                action = np.random.randint(0, 3)  # random
                # actions currently are 0 = no input (drive straight)
                #                       1 = left turn input
                #                       2 = right turn input
            else:
                # Get Q values for each action.
                qval = model.predict(state, batch_size=1)
                action = (np.argmax(qval))  # best

            # Take action, observe new state and get our treat.
            # print(action)
            car.takeAction(action)
            car.update(1 / rate)
            road.plotRoad(screen)

            new_state = road.getState(car)
            (car_reward, done) = road.reward(car)

            # --- Go ahead and update the screen with what we've drawn.
            pygame.display.flip()

            # --- Limit to 60 frames per second
            # clock.tick(rate)
            # print(car_reward)

        # Experience replay storage.
        print(t, 'reward', car_reward)
        # print('state:', state, 'action', action, 'reward',
        #       car_reward, 'new_state', new_state)
        replay.append((state, action, car_reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      epochs=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # print(state)
        # game_state.grid.printGrid()
        # print(reward)

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if done == 1:
            # if reward > 0 or reward==-999:
            # Log the car's distance at this T.
            data_collect.append([t, car_reward])

            # Update max.
            if car_reward > max_car_reward:
                max_car_reward = car_reward

            # Time it.
            tot_time = timeit.default_timer() - start_time
            # fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t" %
                  (max_car_reward, t, epsilon, car_reward))

            # Reset.
            car_reward = 0
            start_time = timeit.default_timer()

        if t % 100 == 0:
            print(t)

        # Save the model every 2,000 frames.
        if t % 2000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    print(train_frames)
    log_results(filename, data_collect, loss_log, train_frames, observe)
Example #7
def train_net(model, params):

    filename = params_to_filename(params)
    observe = 129  # Number of frames to observe before training.
    epsilon = 0.5
    train_frames = 50000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S'). #to be displayed

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.

    while t < train_frames:
        print(t)
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            print("PREDICTED", state)
            # time.sleep(1)
            x = state[0]
            y = state[1]
            qval = model.predict(np.array([x, y]).reshape((1, 2)),
                                 batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, term = game_state.frame_step(action)
        print("timestep :" + str(t) + "Reward" + str(reward) + "action" +
              str(action) + "state" + str(state))
        # Experience replay storage.
        replay.append((state, action, reward, new_state))
        # print(len(replay))
        # If we're done observing, start training.
        if t > observe:
            #print("start")
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)
            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)
            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))
        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (10.0 / train_frames)
            print("EPSILON UPDATED", epsilon)

        # We died, so update stuff.
        if term == 1:
            # print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset (must happen before the continue, or it is dead code).
            car_distance = 0
            continue
        # We reached the goal, so update stuff.
        elif term == 2:
            print("Reached goal.", car_distance)
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset (must happen before the continue, or it is dead code).
            car_distance = 0
            continue

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))
        # if(keyboard.is_pressed('8')):
        #     print("Reset Goal")
        #     game_state.reset_goal()
        print(t, reward, action)
Example #8
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 1002  # Number of frames to play.
    reward = 0
    death = 0
    printstuff = ''
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    max_reward = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunkStatic.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, nothing = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 4)  # random 0-3
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, printstuff = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        #Update max
        if reward > max_reward:
            max_reward = reward

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            print("Max reward : %d", max_reward)

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

            #update death
            death += 1
            if t > observe and death > 10:
                return
    print(printstuff)
Example #9
def train_net(best_action_model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 500000  # Number of frames to play. was 1000000
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    cum_rwd = 0
    cum_rwd_read = 0
    cum_rwd_dist = 0
    cum_rwd_speed = 0

    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    save_init = True
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    state, new_reward, cur_speed, _, _, _ = game_state.frame_step(
        START_ACTION, START_SPEED, START_DISTANCE)

    # frame_step returns reward, state, speed
    #state = state_frames(state, np.array([[0, 0, 0, 0, 0, 0, 0]])) # zeroing distance readings
    #state = state_frames(state, np.zeros((1,NUM_SENSORS))) # zeroing distance readings

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        #time.sleep(0.5)

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, NUM_OUTPUT)  # random
        else:
            # Get Q values for each action
            qval = best_action_model.predict(state, batch_size=1)
            # best_action_model was passed to this function. call it w/ current state
            action = (np.argmax(qval))  # best prediction

        # Take action, observe new state and get our treat.
        new_state, new_reward, new_speed, new_rwd_read, new_rwd_dist, new_rwd_speed = \
            game_state.frame_step(action, cur_speed, car_distance)

        # Use multiple frames.
        #new_state = state_frames(new_state, state) # seems this is appending 2-3 moves, results

        # Experience replay storage.
        replay.append((state, action, new_reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            # WHY RANDOM SAMPLE? COULD TRAINING BE SPED UP BY TAKING LAST BATCHSIZE

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, best_action_model)

            # Train the best_action_model on this batch.
            history = LossHistory()
            best_action_model.fit(X_train,
                                  y_train,
                                  batch_size=batchSize,
                                  nb_epoch=1,
                                  verbose=0,
                                  callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state
        cur_speed = new_speed
        cum_rwd += new_reward
        cum_rwd_read += new_rwd_read
        cum_rwd_dist += new_rwd_dist
        cum_rwd_speed += new_rwd_speed

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if new_reward == -500 or new_reward == -1000:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\t eps: %f\t dist: %d\t rwd: %d\t read: %d\t dist: %d\t speed: %d\t fps: %d" %
                  (max_car_distance, t, epsilon, car_distance, cum_rwd, \
                   cum_rwd_read, cum_rwd_dist, cum_rwd_speed, int(fps)))

            # Reset.
            car_distance = 0
            cum_rwd = 0
            cum_rwd_read = 0
            cum_rwd_dist = 0
            cum_rwd_speed = 0
            start_time = timeit.default_timer()

        # Save the best_action_model every 50,000 frames.
        if t % 50000 == 0:
            save_init = False
            best_action_model.save_weights('saved-best_action_models/' +
                                           filename + '-' + str(t) + '.h5',
                                           overwrite=True)
            print("Saving best_action_model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
Example #10
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 300000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0

    #needed to print information
    global max_reward
    global stuff
    global b_state
    global max_qVal
    frame = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, stuff = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        frame += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 4)  # random 0-1-2-3
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, somestuff = game_state.frame_step(action)
        if reward > max_reward:
            stuff = somestuff
        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("\n\nMax distance: %d at %d\nepsilon %f\n(%d)\n%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))
            print("\n Max reward : %d\t,\n max qVal : %d\t" %
                  (max_reward, max_qVal))
            print('best state', b_state)
            print(stuff)
            print("\n frame:", frame)
            # Reset.
            max_reward = 0
            stuff = ''
            car_distance = 0
            max_qVal = 0
            b_state = [0, 0, 0, 0, 0, 0, 0, 0]

            start_time = timeit.default_timer()

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/BLE/final/' + 'FINAL' + filename +
                               '-' + str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
Example #11
def train_net(model, params):

    filename = params_to_filename(params)

    train_frames = 300000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    t = 0
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = flappy.Game()
    game_state.init_elements()

    # Get initial state by doing nothing and getting the state.
    state, _ = game_state.frame_step(0)

    # Run the frames.
    while t < train_frames:

        t += 1

        # Choose an action.
        qval = model.predict(np.array([state]))[0]
        action = (np.argmax(qval))  # best
        if t % 500 == 0:
            print(qval)

        # Take action, observe new state and get our treat.
        new_state, reward = game_state.frame_step(action)
        if t % 1000 == 0:
            print(t, action, state, reward)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # Once we have more than a batch of samples, start training.
        if t > batchSize:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        if reward == -1000:
            game_state.init_elements()
            state, _ = game_state.frame_step(0)

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('results/saved-models/' + filename + '-' +
                               str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

        if t % 50000 == 0:
            # Log results after we're done all frames.
            log_results(filename, loss_log)
	def train(self,state,simulator):
		self.t+=1
		if random.random()<self.epsilon or self.t<self.observe:
			action = np.random.randint(0, 4)
		else:
			# Get Q values for each action.
			qval = self.model.predict(state, batch_size=1)
			action = (np.argmax(qval)) 
		# Take action, observe new state and get our treat.
		simulator.applyAction(action)
		reward, new_state = simulator.statusVector()
		# Experience replay storage.
		self.replay.append((state, action, reward, new_state))
		if self.t > self.observe:
			# If we've stored enough in our buffer, pop the oldest.
			if len(self.replay) > self.buffer:
				self.replay.pop(0)
			# Randomly sample our experience replay memory
			minibatch = random.sample(self.replay, self.batchSize)
			# Get training values.
			X_train, y_train = process_minibatch2(minibatch, self.model)
			# Train the model on this batch.
			history = LossHistory()
			self.model.fit(
				X_train, y_train, batch_size=self.batchSize,
				nb_epoch=1, verbose=0, callbacks=[history]
			)
		# Decrement epsilon over time.
		if self.epsilon > 0.1 and self.t > self.observe:
			self.epsilon -= (1.0/self.train_frames)

		if self.t % 25000 == 0:
			self.model.save_weights('saved-models/' + self._filename + '-' +
							   str(self.t) + '.h5',
							   overwrite=True)
			print("Saving model %s - %d" % (self._filename, self.t))
		'''TODO need to change to class functions'''
		def process_minibatch2(minibatch, model):
			# by Microos, improve this batch processing function 
			#   and gain 50~60x faster speed (tested on GTX 1080)
			#   significantly increase the training FPS
			
			# instead of feeding data to the model one by one, 
			#   feed the whole batch is much more efficient

			mb_len = len(minibatch)

			old_states = np.zeros(shape=(mb_len, 5))
			actions = np.zeros(shape=(mb_len,))
			rewards = np.zeros(shape=(mb_len,))
			new_states = np.zeros(shape=(mb_len, 5))

			for i, m in enumerate(minibatch):
				old_state_m, action_m, reward_m, new_state_m = m
				old_states[i, :] = old_state_m[...]
				actions[i] = action_m
				rewards[i] = reward_m
				new_states[i, :] = new_state_m[...]

			old_qvals = model.predict(old_states, batch_size=mb_len)
			new_qvals = model.predict(new_states, batch_size=mb_len)

			maxQs = np.max(new_qvals, axis=1)
			y = old_qvals
			non_term_inds = np.where(rewards != -500)[0]
			term_inds = np.where(rewards == -500)[0]

			y[non_term_inds, actions[non_term_inds].astype(int)] = rewards[non_term_inds] + (GAMMA * maxQs[non_term_inds])
			y[term_inds, actions[term_inds].astype(int)] = rewards[term_inds]

			X_train = old_states
			y_train = y
			return X_train, y_train
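
For reference, the vectorized update in process_minibatch2 above computes the standard Q-learning target for each transition. An equivalent per-sample sketch (the function name is illustrative; GAMMA and the -500 terminal reward follow the code above):

import numpy as np

def q_target(reward, new_qvals, gamma):
    # Terminal transitions (reward == -500 in these examples) use the raw reward;
    # otherwise bootstrap from the best next-state Q value.
    if reward == -500:
        return reward
    return reward + gamma * np.max(new_qvals)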
def train_net(turn_model, turn_model_30, turn_model_50, turn_model_70,
              avoid_model, acquire_model, acquire_model_30, acquire_model_50,
              acquire_model_70, hunt_model, pack_model, params):

    filename = params_to_filename(params)

    observe = 2000  # Number of frames to observe before training (same for all modes here).

    epsilon = 1  # vary this based on pre-learning that has already occurred in the lower models
    train_frames = 750000  # number of flips for training
    batchSize = params['batchSize']
    buffer = params['buffer']

    # initialize variables and structures used below.
    max_crash_frame_ctr = 0
    crash_frame_ctr = 0
    total_frame_ctr = 0
    replay_frame_ctr = 0
    stop_ctr = 0
    avoid_ctr = 0
    acquire_ctr = 0
    cum_rwd = 0
    cum_speed = 0

    data_collect = []
    replay = []
    loss_log = []  # replay stores state, action, reward, new state
    save_init = True
    cur_speeds = []
    for i in range(NUM_DRONES):
        cur_speeds.append(START_SPEED)

    # initialize drone state holders
    turn_states = np.zeros(
        [NUM_DRONES, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES])
    avoid_states = np.zeros(
        [NUM_DRONES, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES])
    acquire_states = np.zeros(
        [NUM_DRONES, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES])
    hunt_states = np.zeros(
        [NUM_DRONES, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES])
    drone_states = np.zeros(
        [NUM_DRONES, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES])

    # create game instance
    game_state = carmunk.GameState()

    # get initial state(s)
    turn_state, avoid_state, acquire_state, hunt_state, drone_state, reward, cur_speed = \
        game_state.frame_step(START_DRONE_ID, START_TURN_ACTION, START_SPEED_ACTION,
                              START_PACK_ACTION, START_SPEED, START_DISTANCE, 1)

    # initialize frame states
    if cur_mode in [TURN, AVOID, HUNT, PACK]:

        for i in range(NUM_DRONES):
            turn_states[i] = state_frames(
                turn_state,
                np.zeros((1, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES)),
                TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

        if cur_mode in [AVOID, HUNT, PACK]:

            for i in range(NUM_DRONES):
                avoid_states[i] = state_frames(
                    avoid_state,
                    np.zeros((1, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES)),
                    AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

    if cur_mode in [ACQUIRE, HUNT, PACK]:

        for i in range(NUM_DRONES):
            acquire_states[i] = state_frames(
                acquire_state,
                np.zeros((1, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES)),
                ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

    if cur_mode in [HUNT, PACK]:

        for i in range(NUM_DRONES):
            hunt_states[i] = state_frames(
                hunt_state,
                np.zeros((1, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES)),
                HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

    if cur_mode == PACK:

        for i in range(NUM_DRONES):
            drone_states[i] = state_frames(
                drone_state,
                np.zeros((1, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES)),
                DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

        pack_state = state_frames(
            drone_state, np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)),
            PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)

    # time it
    start_time = timeit.default_timer()

    # run frames
    while total_frame_ctr < train_frames:

        total_frame_ctr += 1  # counts total training distance traveled
        crash_frame_ctr += 1  # counts distance between crashes
        replay_frame_ctr += 1  # counts frames between pack mode replay captures

        # used to slow things down for de-bugging
        #time.sleep(0.25)

        for drone_id in range(
                NUM_DRONES):  # NUM_DRONES = 1, unless you're in PACK mode

            speed_action = START_SPEED_ACTION

            # choose appropriate action(s)
            # note: only generates random inputs for currently training model.
            # all prior (sub) models provide their best (fully-trained) inputs
            if random.random() < epsilon or total_frame_ctr < observe:  # epsilon degrades over flips...
                if cur_mode == TURN:
                    turn_action = set_turn_action(
                        True, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id],
                            np.array([turn_states[drone_id]]))

                    if cur_mode == AVOID:
                        speed_action = set_avoid_action(
                            True, turn_action,
                            np.array([avoid_states[drone_id]]))
                    else:
                        if cur_mode in [HUNT, PACK]:
                            speed_action = set_avoid_action(
                                False, turn_action,
                                np.array([avoid_states[drone_id]]))

                        if cur_mode == ACQUIRE:
                            acquire_action = set_acquire_action(
                                True, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))
                            turn_action = acquire_action
                        else:
                            acquire_action, acquire_model = set_acquire_action(
                                False, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))

                            if cur_mode == HUNT:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    True, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))
                            else:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    False, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))

                                if cur_mode == PACK and (
                                        total_frame_ctr == 1 or
                                    (replay_frame_ctr - 1) % PACK_EVAL_FRAMES
                                        == 0) and drone_id == 0:
                                    pack_action = set_pack_action(
                                        True, pack_state)
                                    # note: pack action only changed every PACK_EVAL_FRAMES.
                                    # for frames in between it's constant

            else:  # ...increasing use of predictions over time
                if cur_mode == TURN:
                    turn_action, turn_model = set_turn_action(
                        False, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id],
                            np.array([turn_states[drone_id]]))

                    if cur_mode == AVOID:
                        speed_action = set_avoid_action(
                            False, turn_action,
                            np.array([avoid_states[drone_id]]))
                    else:
                        if cur_mode in [HUNT, PACK]:
                            speed_action = set_avoid_action(
                                False, turn_action,
                                np.array([avoid_states[drone_id]]))

                        if cur_mode == ACQUIRE:
                            acquire_action, acquire_model = set_acquire_action(
                                False, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))
                            turn_action = acquire_action
                        else:
                            acquire_action, acquire_model = set_acquire_action(
                                False, cur_speeds[drone_id],
                                np.array([acquire_states[drone_id, ]]))

                            if cur_mode == HUNT:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    False, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))
                            else:
                                hunt_action, turn_action, speed_action = set_hunt_action(
                                    False, cur_speeds[drone_id], turn_action,
                                    speed_action, acquire_action,
                                    np.array([hunt_states[drone_id, ]]))

                                if cur_mode == PACK and (
                                        total_frame_ctr == 1 or
                                    (replay_frame_ctr - 1) % PACK_EVAL_FRAMES
                                        == 0) and drone_id == 0:
                                    # get 1 pack action for each set of drones on first drone
                                    pack_action = set_pack_action(
                                        False, pack_state)
                                    print(pack_action)

            #print("++++++ pack action:", pack_action)
            #print(2)
            # pass action, receive new state, reward
            new_turn_state, new_avoid_state, new_acquire_state, new_hunt_state, new_drone_state, new_reward, new_speed = game_state.frame_step(
                drone_id, turn_action, speed_action, pack_action,
                cur_speeds[drone_id], total_frame_ctr, replay_frame_ctr)

            #print("********** 2. new states / rewards:")
            #print(total_frame_ctr)
            #print(drone_id)
            #print(new_drone_state)
            #print(new_reward)

            #print(3)
            # append (horizontally) historical states for learning speed.
            """ note: do this concatination even for models that are not learning (e.g., turn when running search or turn, search and acquire while running hunt) b/c their preds, performed above, expect the same multi-frame view that was in place when they trained."""

            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                new_turn_state = state_frames(
                    new_turn_state, np.array([turn_states[drone_id]]),
                    TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

            if cur_mode in [AVOID, HUNT, PACK]:
                new_avoid_state = state_frames(
                    new_avoid_state, np.array([avoid_states[drone_id]]),
                    AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                new_acquire_state = state_frames(
                    new_acquire_state, np.array([acquire_states[drone_id]]),
                    ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

            if cur_mode in [HUNT, PACK]:
                new_hunt_state = state_frames(
                    new_hunt_state, np.array([hunt_states[drone_id]]),
                    HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

            #print(4)
            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                if drone_id == 0:  # for 1st drone, pack state = drone state
                    new_pack_state = new_drone_state
                    pack_rwd = new_reward

                else:  # otherwise, append drone record to prior drone state
                    new_pack_state = state_frames(new_pack_state,
                                                  new_drone_state,
                                                  DRONE_TOTAL_SENSOR, 2)
                    pack_rwd += new_reward

                new_drone_state = state_frames(
                    new_drone_state, np.array([drone_states[drone_id]]),
                    DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

                if drone_id == (NUM_DRONES -
                                1):  # for last drone build pack record
                    if total_frame_ctr == 1:
                        pack_state = np.zeros(
                            (1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES))

                    new_pack_state = state_frames(
                        new_pack_state, pack_state, PACK_TOTAL_SENSORS,
                        PACK_STATE_FRAMES
                    )  #may need to add 1 to PACK_STATE_FRAMES

                    #print("**** 3. final pack reward:")
                    #print(pack_rwd)

            #print(5)
            # experience replay storage
            """note: only the model being trained requires event storage as it is stack that will be sampled for training below."""
            if cur_mode == TURN:
                replay.append((np.array([turn_states[drone_id]]), turn_action,
                               new_reward, new_turn_state))

            elif cur_mode == AVOID:
                replay.append((np.array([avoid_states[drone_id]]),
                               speed_action, new_reward, new_avoid_state))

            elif cur_mode == ACQUIRE:
                replay.append((np.array([acquire_states[drone_id]]),
                               turn_action, new_reward, new_acquire_state))

            elif cur_mode == HUNT:
                replay.append((np.array([hunt_states[drone_id]]), hunt_action,
                               new_reward, new_hunt_state))

            elif cur_mode == PACK and (total_frame_ctr == 1
                                       or replay_frame_ctr % PACK_EVAL_FRAMES
                                       == 0) and drone_id == (NUM_DRONES - 1):
                replay.append(
                    (pack_state, pack_action, pack_rwd, new_pack_state))
                #print(replay[-1])

            #print("6a")
            # If we're done observing, start training.
            if total_frame_ctr > observe and (
                    cur_mode != PACK or
                (replay_frame_ctr % PACK_EVAL_FRAMES == 0
                 and drone_id == (NUM_DRONES - 1))):

                # If we've stored enough in our buffer, pop the oldest.
                if len(replay) > buffer:
                    replay.pop(0)

                # Randomly sample our experience replay memory
                minibatch = random.sample(replay, batchSize)

                if cur_mode == TURN:
                    # Get training values.
                    X_train, y_train = process_minibatch(
                        minibatch, turn_model, TURN_NUM_INPUT, TURN_NUM_OUTPUT)
                    history = LossHistory()
                    turn_model.fit(X_train,
                                   y_train,
                                   batch_size=batchSize,
                                   nb_epoch=1,
                                   verbose=0,
                                   callbacks=[history])

                elif cur_mode == AVOID:
                    X_train, y_train = process_minibatch(
                        minibatch, avoid_model, AVOID_NUM_INPUT,
                        AVOID_NUM_OUTPUT)
                    history = LossHistory()
                    avoid_model.fit(X_train,
                                    y_train,
                                    batch_size=batchSize,
                                    nb_epoch=1,
                                    verbose=0,
                                    callbacks=[history])

                elif cur_mode == ACQUIRE:
                    X_train, y_train = process_minibatch(
                        minibatch, acquire_model, ACQUIRE_NUM_INPUT,
                        ACQUIRE_NUM_OUTPUT)
                    history = LossHistory()
                    acquire_model.fit(X_train,
                                      y_train,
                                      batch_size=batchSize,
                                      nb_epoch=1,
                                      verbose=0,
                                      callbacks=[history])

                elif cur_mode == HUNT:
                    X_train, y_train = process_minibatch(
                        minibatch, hunt_model, HUNT_NUM_INPUT, HUNT_NUM_OUTPUT)
                    history = LossHistory()
                    hunt_model.fit(X_train,
                                   y_train,
                                   batch_size=batchSize,
                                   nb_epoch=1,
                                   verbose=0,
                                   callbacks=[history])

                elif cur_mode == PACK:
                    X_train, y_train = process_minibatch(
                        minibatch, pack_model, PACK_NUM_INPUT, PACK_NUM_OUTPUT)
                    history = LossHistory()
                    pack_model.fit(X_train,
                                   y_train,
                                   batch_size=batchSize,
                                   nb_epoch=1,
                                   verbose=0,
                                   callbacks=[history])

                loss_log.append(history.losses)

            # Update the starting state with S'.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                turn_states[drone_id] = new_turn_state

            if cur_mode in [AVOID, HUNT, PACK]:
                avoid_states[drone_id] = new_avoid_state

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                acquire_states[drone_id] = new_acquire_state

            if cur_mode in [HUNT, PACK]:
                hunt_states[drone_id] = new_hunt_state

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                drone_states[drone_id] = new_drone_state

                if drone_id == (NUM_DRONES - 1):
                    pack_state = new_pack_state
                    replay_frame_ctr = 0

            cur_speeds[drone_id] = new_speed
            cum_rwd += new_reward

            # In case of a crash, report and reset the episode counters.
            if new_reward == -500 or new_reward == -1000:
                # Log the car's distance at this T.
                data_collect.append([total_frame_ctr, crash_frame_ctr])

                # Update max.
                if crash_frame_ctr > max_crash_frame_ctr:
                    max_crash_frame_ctr = crash_frame_ctr

                # Time it.
                tot_time = timeit.default_timer() - start_time
                fps = crash_frame_ctr / tot_time

                # Output some stuff so we can watch.
                print(
                    "Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d\t fps: %d"
                    % (max_crash_frame_ctr, total_frame_ctr, epsilon,
                       crash_frame_ctr, cur_mode, cum_rwd, int(fps)))

                # Reset.
                crash_frame_ctr = cum_rwd = cum_speed = 0
                start_time = timeit.default_timer()

        # Decrement epsilon for another frame.
        if epsilon > 0.1 and total_frame_ctr > observe:
            epsilon -= (1 / train_frames)

        # Print a status line every 10,000 frames.
        if total_frame_ctr % 10000 == 0 and crash_frame_ctr != 0:
            print(
                "Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d"
                % (max_crash_frame_ctr, total_frame_ctr, epsilon,
                   crash_frame_ctr, cur_mode, cum_rwd))

        # Save the model every 50,000 frames.
        if total_frame_ctr % 50000 == 0:
            save_init = False
            if cur_mode == TURN:
                turn_model.save_weights('models/turn/turn-' + filename + '-' +
                                        str(START_SPEED) + '-' +
                                        str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving turn_model %s - %d - %d" %
                      (filename, START_SPEED, total_frame_ctr))

            elif cur_mode == AVOID:
                avoid_model.save_weights('models/avoid/avoid-' + filename +
                                         '-' + str(total_frame_ctr) + '.h5',
                                         overwrite=True)
                print("Saving avoid_model %s - %d" %
                      (filename, total_frame_ctr))

            elif cur_mode == ACQUIRE:
                acquire_model.save_weights('models/acquire/acquire-' +
                                           filename + '-' + str(START_SPEED) +
                                           '-' + str(total_frame_ctr) + '.h5',
                                           overwrite=True)
                print("Saving acquire_model %s - %d" %
                      (filename, total_frame_ctr))

            elif cur_mode == HUNT:
                hunt_model.save_weights('models/hunt/hunt-' + filename + '-' +
                                        str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving hunt_model %s - %d" %
                      (filename, total_frame_ctr))

            elif cur_mode == PACK:
                pack_model.save_weights('models/pack/pack-' + filename + '-' +
                                        str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving pack_model %s - %d" %
                      (filename, total_frame_ctr))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
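
Every example above relies on a LossHistory callback that is never defined in the snippets themselves. A minimal sketch of what such a callback usually looks like (assuming the old Keras 1.x-style API used here, where model.fit takes nb_epoch and callbacks) is:

from keras.callbacks import Callback


class LossHistory(Callback):
    """Collect the loss reported by Keras after every training batch."""

    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        # Keras passes the current metrics in `logs`; keep only the loss.
        self.losses.append(logs.get('loss'))

Each call to model.fit(..., callbacks=[history]) then leaves the per-batch losses in history.losses, which the training loops append to loss_log.
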
def train_net(model, params):
    global counter
    global lastState
    global last_action
    global lastreward
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 1000000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step((2))
    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action. While observing (and with probability epsilon
        # afterwards) act randomly; otherwise act greedily on the Q values
        # predicted for the augmented state built on the previous frame.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(train_new_state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Build the augmented state from the previous and current sensor
        # readings plus the last action. After a crash, reset the previous
        # state to the current one.
        if lastreward < -100:
            lastState = state
        train_state = np.append(lastState, state[0])
        train_state = np.append(train_state, last_action)
        train_state = np.expand_dims(train_state, axis=0)

        # Take action, observe new state and get our treat.
        reward, new_state = game_state.frame_step(action)
        train_new_state = np.append(state[0], new_state[0])
        train_new_state = np.append(train_new_state, action)
        train_new_state = np.expand_dims(train_new_state, axis=0)

        # Experience replay storage. When the sensor readings sum to 42 or
        # more, only store every 40th transition so the buffer is not
        # flooded with near-identical states.
        if sum(state[0]) >= 42:
            counter += 1
            if counter % 40 == 0:
                replay.append((train_state, action, reward, train_new_state))
                if counter > 1000000000:
                    counter = 0
        else:
            replay.append((train_state, action, reward, train_new_state))

        # Roll the state forward for the next frame.
        lastState = np.copy(state)
        state = np.copy(new_state)
        last_action = action
        # If we're done observing, start training.
        if t > observe:

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            batchSize1 = len(X_train)
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize1,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= 5 * (1 / train_frames)

        # We died, so update stuff.
        lastreward = reward
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model every 10,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
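
The minibatch processing helpers (process_minibatch / process_minibatch2) are also referenced but not shown. A rough sketch of the Q-learning target construction they typically perform is below; the discount factor, the terminal-reward check (-500 here), and the extra num_input/num_output parameters used by the multi-model variant are assumptions, so treat this as an illustration rather than the exact helper these examples import.

import numpy as np

GAMMA = 0.9  # assumed discount factor


def process_minibatch(minibatch, model, num_input=None, num_output=None):
    """Turn sampled (S, A, R, S') tuples into supervised (X, y) pairs."""
    X_train, y_train = [], []
    for state, action, reward, new_state in minibatch:
        # Current Q estimates for the old state.
        old_qval = model.predict(state, batch_size=1)
        # Best Q value reachable from the next state.
        max_q = np.max(model.predict(new_state, batch_size=1))

        y = old_qval.copy()
        if reward == -500:           # terminal (crash): no future reward
            y[0][action] = reward
        else:                        # non-terminal: bootstrap from S'
            y[0][action] = reward + GAMMA * max_q

        X_train.append(state.reshape(state.shape[1],))
        y_train.append(y.reshape(y.shape[1],))

    return np.array(X_train), np.array(y_train)

Only the Q value of the action actually taken is moved toward reward + GAMMA * max_q; the other outputs keep their predicted values, so the regression loss only affects the chosen action.
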
Exemple #15
0
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000
    epsilon = 1
    train_frames = 100000
    batchSize = params['batchSize']
    buffer = params['buffer']
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []
    loss_log = []
    game_state = UI.GameState()
    _, state = game_state.frame_step((2))
    start_time = timeit.default_timer()
    while t < train_frames:
        t += 1
        car_distance += 1
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 3)
        else:
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))

        reward, new_state = game_state.frame_step(action)
        replay.append((state, action, reward, new_state))
        if t > observe:
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            X_train, y_train = process_minibatch2(minibatch, model)

            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)

        state = new_state

        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        if reward == -500:
            data_collect.append([t, car_distance])

            if car_distance > max_car_distance:
                max_car_distance = car_distance

            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    log_results(filename, data_collect, loss_log)
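
Two more helpers appear in every variant without being defined: params_to_filename, which turns the hyperparameter dict into a name for the saved weight files, and log_results, which writes data_collect and loss_log to disk for later plotting. Plausible minimal versions are sketched below; the 'nn' key (hidden-layer sizes) and the results/ output paths are assumptions inferred from how the functions are called, not part of the examples above.

import csv


def params_to_filename(params):
    # e.g. "164-150-100-50000" for two hidden layers, batchSize 100, buffer 50000.
    return '-'.join(str(size) for size in params.get('nn', [])) + '-' + \
        str(params['batchSize']) + '-' + str(params['buffer'])


def log_results(filename, data_collect, loss_log):
    # One row per episode: [frame index, frames survived].
    with open('results/learn_data-' + filename + '.csv', 'w') as data_file:
        csv.writer(data_file).writerows(data_collect)

    # Per-batch losses collected by LossHistory, one row per fit() call.
    with open('results/loss_data-' + filename + '.csv', 'w') as loss_file:
        writer = csv.writer(loss_file)
        for losses in loss_log:
            writer.writerow(losses)
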
Exemple #16
0
def train_net(model, params):

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 110000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step((1))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:

        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(3)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            #print("start")
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=batchSize,
                      nb_epoch=1,
                      verbose=0,
                      callbacks=[history])
            loss_log.append(history.losses)
            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))
        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        # We died, so update stuff.
        if reward <= -500:
            #print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0
        # We reached the goal, so update stuff.
        elif reward >= 2000:
            print("Reached goal.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) +
                               '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))