def train_models(X_train, y_train, batchSize, model, loss_log):
    history = LossHistory()
    model.fit(X_train, y_train, batch_size=batchSize,
              nb_epoch=1, verbose=0, callbacks=[history])
    loss_log.append(history.losses)
    return loss_log
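# All of the training loops in this section create a LossHistory() and pass it
# via callbacks=[history], then read history.losses, but the class itself is not
# shown here. The sketch below is an assumed minimal version -- a Keras callback
# that records the loss after every batch -- and is not necessarily the exact
# class these scripts use.
from keras.callbacks import Callback


class LossHistory(Callback):
    def on_train_begin(self, logs=None):
        # Reset the list of per-batch losses at the start of each fit() call.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # Record the training loss Keras reports for this batch.
        self.losses.append(logs.get('loss'))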
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 100000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    min_distance = 10000
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action.
        # if random.random() < epsilon or t < observe:
        #     action = np.random.randint(0, 3)  # random
        # else:
        # Get Q values for each action.
        qval = model.predict(state, batch_size=1)
        action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, distance = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory:
            # a list of batchSize tuples, each holding 4 elements (S, A, R, S').
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(
                X_train, y_train, batch_size=batchSize,
                nb_epoch=1, verbose=0, callbacks=[history]
            )
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        if distance < min_distance:
            min_distance = distance

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            # print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d\t%f fps" %
            #       (max_car_distance, t, epsilon, car_distance, distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        if t % 10 == 0:
            print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d \tmin_distance %d" %
                  (max_car_distance, t, epsilon, car_distance, distance, min_distance))

        # Save the model every 10,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
        reward = np.dot(train, weightReadings)
        reward = reward.astype(int)

        if trainCount > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values by SARSA(0).
            X_train, y_train = tv.sarsa0_minibatch(minibatch, model, sarsa0P)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(
                X_train, y_train, batch_size=batchSize,
                nb_epoch=1, verbose=0, callbacks=[history]
            )
            loss_log.append(history.losses)

        state = new_state

        if epsilon > final_epsilon and trainCount > observe:
            epsilon -= (1 / train_frames)
            print(epsilon)

        # Save the model every 25,000 frames.
        filename = 'train5'
    def update_replay(self, reward, new_state, action=None):
        if action is None:
            action = self.lastAction

        # Experience replay storage.
        self.replay.append(
            (np.copy(self.old_state), action, reward, np.copy(new_state)))

        # If we're done observing, start training.
        if self.t > self.observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(self.replay) > self.buffer:
                self.replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(self.replay, self.batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, self.model,
                                                  self.sequence_length,
                                                  self.end_value, self.GAMMA)

            # Train the model on this batch.
            history = LossHistory()
            self.model.fit(X_train, y_train, batch_size=self.batchSize,
                           nb_epoch=1, verbose=0, callbacks=[history])
            self.loss_log.append(history.losses)

            if self.t % self.save_every == 0:
                if len(self.data_collect) > 50:
                    # Save the results to a file so we can graph it later.
                    learn_f = ('results/command-frames/learn_data-' +
                               self.filename + '.csv')
                    with open(learn_f, 'w', newline='') as data_dump:
                        wr = csv.writer(data_dump)
                        wr.writerows(self.data_collect)
                    plotting.plot_file(learn_f, 'learn')

                if len(self.loss_log) > 500:
                    loss_f = ('results/command-frames/loss_data-' +
                              self.filename + '.csv')
                    with open(loss_f, 'w', newline='') as lf:
                        wr = csv.writer(lf)
                        for loss_item in self.loss_log:
                            wr.writerow(loss_item)
                    plotting.plot_file(loss_f, 'loss')

        # Update the starting state with S'.
        self.state = new_state

        # Decrement epsilon over time.
        if self.epsilon > 0.1 and self.t > self.observe:
            self.epsilon -= (1.0 / self.train_frames)

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            print([self.t, self.hacker_cmds])
            self.data_collect.append([self.t, self.hacker_cmds])

            # Update max.
            if self.hacker_cmds > self.max_hacker_cmds:
                self.max_hacker_cmds = self.hacker_cmds

            # Time it.
            tot_time = timeit.default_timer() - self.start_time
            fps = self.hacker_cmds / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (self.max_hacker_cmds, self.t, self.epsilon,
                   self.hacker_cmds, fps))

            # Reset.
            self.hacker_cmds = 0
            self.start_time = timeit.default_timer()

        # Save the model and replay buffer every save_every frames.
        if self.t % self.save_every == 0:
            pickle.dump(
                self.replay,
                open(self.save_replay_file_prefix + "-" + str(self.t), "wb"))
            model_save_filename = (self.save_model_file_prefix + self.filename +
                                   '-' + str(self.t) + '.h5')
            self.model.save_weights(model_save_filename, overwrite=True)
            print("Saving model %s - %d" % (self.filename, self.t))
def train(model, params):
    filename = params_to_filename(params)

    EPISODE = 10
    FRAMES = 4000
    OBSERVE = FRAMES * 3
    epsilon = 1
    batchSize = params['batchSize']
    buffer = params['buffer']
    replay = []
    minibatch = []
    total_frames = 0
    path_log = []
    loss_log = []
    # min_path_length = 0

    for m in range(EPISODE):
        print("Episode: %d" % (m))
        gameObject = GameClass(draw_screen=True, display_path=True, fps=FPS)

        # Choose no action in the initial frame.
        action = 2
        reward, state = gameObject.frame_step(action)

        for t in range(FRAMES):
            total_frames += 1
            if t % (FRAMES / 10) == 0:
                print("Frames: %d" % (t))

            # Choose the action based on the epsilon-greedy algorithm.
            if random.random() < epsilon or total_frames < OBSERVE:
                # Choose a random action.
                action = np.random.randint(0, 3)
            else:
                # Choose the best action from the Q(s, a) values.
                # Run our Q function on (state, action) to get Q values
                # for all possible actions.
                Q = np.zeros(3)
                for a in range(3):
                    features = get_features(state, a)
                    Q[a] = model.predict(features, batch_size=batchSize)
                action = (np.argmax(Q))

            # Execute the action, observe the new state and reward.
            reward, state_new = gameObject.frame_step(action)
            path_length = gameObject.num_steps

            # Store the (state, action, reward, new state) tuple in the replay.
            memory = state, action, reward, state_new
            replay.append(memory)

            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory if we have enough samples.
            if total_frames > OBSERVE:
                minibatch = random.sample(replay, batchSize)

                # Process the minibatch to get the training data.
                X_train, y_train = process_minibatch(minibatch, model, batchSize)

                # Train the model on this batch.
                history = LossHistory()
                model.fit(X_train, y_train, batch_size=batchSize,
                          verbose=0, callbacks=[history])
                loss_log.append(history.losses)

                # Decrement epsilon over time.
                if epsilon > 0.1:
                    epsilon -= 1.0 / (FRAMES * EPISODE - OBSERVE)

            # Update the starting state with S'.
            state = state_new

            # Stop this episode if we achieved the goal.
            if gameObject.check_reach_goal():
                # Log the robot's path length.
                path_log.append([m, path_length])

                # # Update the min.
                # if path_length < min_path_length:
                #     min_path_length = path_length

                # # Output some stuff so we can watch.
                # print("Min: %d \t epsilon %f\t(%d)" %
                #       (min_path_length, epsilon, path_length))

                # Stop this episode.
                break

        # Save the model every episode after observation.
        if total_frames > OBSERVE:
            model.save('saved-models/model_nn-' + filename + '-' +
                       str(m) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, m))

    # Log results after we're done all episodes.
    log_results(filename, path_log, loss_log, m)
def train_net(model, params, mode='grid'):
    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 10000  # Number of frames to play.
    train_frames = TRAIN_FRAMES
    filename = params_to_filename(params, mode, train_frames)
    print(filename)

    if mode == 'lane_following':
        rate = 10  # Hz
        screen = pygame.display.set_mode((1300, 600))
        pygame.display.set_caption("mdeyo car sim")
        background = pygame.Surface(screen.get_size())
        background.fill((0, 0, 0))
        RED = (255, 0, 0)
        car = Car2(RED, 60, 385, screen, 100)
        road = CurvedRoad(1200, 60, 385, '45')
        car.constant_speed = True
        state = road.getState(car)
        print('state:', state)

    if mode == 'grid':
        # Create a new game instance.
        # game_state = carmunk.GameState()
        grid = Grid(X_DIM, Y_DIM)
        car = Car(grid, 0, 0)
        game_state = World(grid, car, 500, 10, False)

        # Get initial state by doing nothing and getting the state.
        _, state = game_state.updateState(0)

    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_reward = -999999
    car_reward = 0
    t = 0
    done = 0  # only set by the lane-following reward; initialized so grid mode can run
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S')
    loss_log = []

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1

        if mode == 'grid':
            # Choose an action.
            if random.random() < epsilon or t < observe:
                action = np.random.randint(0, 3)  # random
            else:
                # Get Q values for each action.
                qval = model.predict(state, batch_size=1)
                action = (np.argmax(qval))  # best

            # Take action, observe new state and get our treat.
            # reward, new_state = game_state.frame_step(action)
            car_reward, new_state = game_state.updateState(action)
            # car_reward = reward
            # print(reward)

        elif mode == 'lane_following':
            # Choose an action.
            if random.random() < epsilon or t < observe:
                action = np.random.randint(0, 3)  # random
                # Actions currently are 0 = no input (drive straight),
                # 1 = left turn input, 2 = right turn input.
            else:
                # Get Q values for each action.
                qval = model.predict(state, batch_size=1)
                action = (np.argmax(qval))  # best

            # Take action, observe new state and get our treat.
            # print(action)
            car.takeAction(action)
            car.update(1 / rate)
            road.plotRoad(screen)
            new_state = road.getState(car)
            (car_reward, done) = road.reward(car)

            # --- Go ahead and update the screen with what we've drawn.
            pygame.display.flip()

            # --- Limit to 60 frames per second.
            # clock.tick(rate)
            # print(car_reward)

        # Experience replay storage.
        print(t, 'reward', car_reward)
        # print('state:', state, 'action', action, 'reward',
        #       car_reward, 'new_state', new_state)
        replay.append((state, action, car_reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      epochs=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state
        # print(state)
        # game_state.grid.printGrid()
        # print(reward)

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if done == 1:
            # if reward > 0 or reward == -999:
            # Log the car's reward at this T.
            data_collect.append([t, car_reward])

            # Update max.
            if car_reward > max_car_reward:
                max_car_reward = car_reward

            # Time it.
            tot_time = timeit.default_timer() - start_time
            # fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t" %
                  (max_car_reward, t, epsilon, car_reward))

            # Reset.
            car_reward = 0
            start_time = timeit.default_timer()

        # Save the model every 2,000 frames.
        if t % 100 == 0:
            print(t)
        if t % 2000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    print(train_frames)
    log_results(filename, data_collect, loss_log, train_frames, observe)
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 129  # Number of frames to observe before training.
    epsilon = 0.5
    train_frames = 50000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    # to be displayed
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        print(t)
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            print("PREDICTED", state)
            # time.sleep(1)
            x = state[0]
            y = state[1]
            qval = model.predict(np.array([x, y]).reshape((1, 2)), batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, term = game_state.frame_step(action)
        print("timestep: " + str(t) + " reward: " + str(reward) +
              " action: " + str(action) + " state: " + str(state))

        # Experience replay storage.
        replay.append((state, action, reward, new_state))
        # print(len(replay))

        # If we're done observing, start training.
        if t > observe:
            # print("start")
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      verbose=0, callbacks=[history])
            loss_log.append(history.losses)

            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (10.0 / train_frames)
            print("EPSILON UPDATED", epsilon)

        # We died, so update stuff.
        if term == 1:
            # print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0
            continue

        # We reached the goal, so update stuff.
        elif term == 2:
            print("Reached goal.", car_distance)
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0
            continue

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

        # if keyboard.is_pressed('8'):
        #     print("Reset Goal")
        #     game_state.reset_goal()

        print(t, reward, action)
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 1002  # Number of frames to play.
    reward = 0
    death = 0
    printstuff = ''
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    max_reward = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunkStatic.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, nothing = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 4)  # random 0-1-2-3
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, printstuff = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # Update max.
        if reward > max_reward:
            max_reward = reward

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))
            print("Max reward : %d" % max_reward)

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

            # Update death count.
            death += 1

        if t > observe and death > 10:
            return

        print(printstuff)
def train_net(best_action_model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 500000  # Number of frames to play. Was 1000000.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    cum_rwd = 0
    cum_rwd_read = 0
    cum_rwd_dist = 0
    cum_rwd_speed = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    save_init = True
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    # frame_step returns state, reward, speed, and the reward components.
    state, new_reward, cur_speed, _, _, _ = game_state.frame_step(
        START_ACTION, START_SPEED, START_DISTANCE)
    # state = state_frames(state, np.array([[0, 0, 0, 0, 0, 0, 0]]))  # zeroing distance readings
    # state = state_frames(state, np.zeros((1, NUM_SENSORS)))  # zeroing distance readings

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        # time.sleep(0.5)
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, NUM_OUTPUT)  # random
        else:
            # Get Q values for each action.
            # best_action_model was passed to this function; call it with the current state.
            qval = best_action_model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best prediction

        # Take action, observe new state and get our treat.
        new_state, new_reward, new_speed, new_rwd_read, new_rwd_dist, new_rwd_speed = \
            game_state.frame_step(action, cur_speed, car_distance)

        # Use multiple frames.
        # new_state = state_frames(new_state, state)  # seems this is appending 2-3 moves, results

        # Experience replay storage.
        replay.append((state, action, new_reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            # WHY RANDOM SAMPLE? COULD TRAINING BE SPED UP BY TAKING THE LAST BATCHSIZE?
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, best_action_model)

            # Train the best_action_model on this batch.
            history = LossHistory()
            best_action_model.fit(X_train, y_train, batch_size=batchSize,
                                  nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state
        cur_speed = new_speed
        cum_rwd += new_reward
        cum_rwd_read += new_rwd_read
        cum_rwd_dist += new_rwd_dist
        cum_rwd_speed += new_rwd_speed

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if new_reward == -500 or new_reward == -1000:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\t eps: %f\t dist: %d\t rwd: %d\t read: %d\t dist: %d\t speed: %d\t fps: %d" %
                  (max_car_distance, t, epsilon, car_distance, cum_rwd,
                   cum_rwd_read, cum_rwd_dist, cum_rwd_speed, int(fps)))

            # Reset.
            car_distance = 0
            cum_rwd = 0
            cum_rwd_read = 0
            cum_rwd_dist = 0
            cum_rwd_speed = 0
            start_time = timeit.default_timer()

        # Save the best_action_model every 50,000 frames.
        if t % 50000 == 0:
            save_init = False
            best_action_model.save_weights('saved-best_action_models/' + filename + '-' +
                                           str(t) + '.h5', overwrite=True)
            print("Saving best_action_model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 300000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0

    # Needed to print information.
    global max_reward
    global stuff
    global b_state
    global max_qVal

    frame = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, stuff = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        frame += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 4)  # random 0-1-2-3
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state, somestuff = game_state.frame_step(action)
        if reward > max_reward:
            stuff = somestuff

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1 / train_frames)

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("\n\nMax distance: %d at %d\nepsilon %f\n(%d)\n%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))
            print("\n Max reward : %d\t,\n max qVal : %d\t" % (max_reward, max_qVal))
            print('best state', b_state)
            print(stuff)
            print("\n frame:", frame)

            # Reset.
            max_reward = 0
            stuff = ''
            car_distance = 0
            max_qVal = 0
            b_state = [0, 0, 0, 0, 0, 0, 0, 0]
            start_time = timeit.default_timer()

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/BLE/final/' + 'FINAL' + filename +
                               '-' + str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
def train_net(model, params):
    filename = params_to_filename(params)

    train_frames = 300000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    t = 0
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = flappy.Game()
    game_state.init_elements()

    # Get initial state by doing nothing and getting the state.
    state, _ = game_state.frame_step(0)

    # Run the frames.
    while t < train_frames:
        t += 1

        # Choose an action.
        qval = model.predict(np.array([state]))[0]
        action = (np.argmax(qval))  # best
        if t % 500 == 0:
            print(qval)

        # Take action, observe new state and get our treat.
        new_state, reward = game_state.frame_step(action)
        if t % 1000 == 0:
            print(t, action, state, reward)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > batchSize:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        if reward == -1000:
            game_state.init_elements()
            state, _ = game_state.frame_step(0)

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('results/saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

        # Log results every 50,000 frames.
        if t % 50000 == 0:
            log_results(filename, loss_log)
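# Several of the loops above call process_minibatch(minibatch, model) without
# showing it (the batched process_minibatch2 appears further down, and some
# variants pass extra arguments). The sketch below is an assumption of the usual
# per-sample version of that helper: for each (S, A, R, S') tuple it builds the
# standard Q-learning target R + GAMMA * max_a' Q(S', a'), or just R on terminal
# transitions. GAMMA and the -500 terminal-reward convention are assumptions
# carried over from the surrounding code, not confirmed definitions.
import numpy as np

GAMMA = 0.9  # assumed discount factor


def process_minibatch(minibatch, model):
    X_train, y_train = [], []
    for old_state, action, reward, new_state in minibatch:
        # Q values for the old and the new state.
        old_qval = model.predict(old_state, batch_size=1)
        new_qval = model.predict(new_state, batch_size=1)
        max_q = np.max(new_qval)

        # Copy the old predictions and overwrite only the taken action's target.
        y = np.copy(old_qval[0])
        if reward != -500:              # non-terminal transition: bootstrap
            y[action] = reward + GAMMA * max_q
        else:                           # terminal transition: no bootstrap
            y[action] = reward

        X_train.append(old_state.reshape(-1))
        y_train.append(y)
    return np.array(X_train), np.array(y_train)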
    def train(self, state, simulator):
        self.t += 1

        if random.random() < self.epsilon or self.t < self.observe:
            action = np.random.randint(0, 4)
        else:
            # Get Q values for each action.
            qval = self.model.predict(state, batch_size=1)
            action = (np.argmax(qval))

        # Take action, observe new state and get our treat.
        simulator.applyAction(action)
        reward, new_state = simulator.statusVector()

        # Experience replay storage.
        self.replay.append((state, action, reward, new_state))

        if self.t > self.observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(self.replay) > self.buffer:
                self.replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(self.replay, self.batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, self.model)

            # Train the model on this batch.
            history = LossHistory()
            self.model.fit(
                X_train, y_train, batch_size=self.batchSize,
                nb_epoch=1, verbose=0, callbacks=[history]
            )

        # Decrement epsilon over time.
        if self.epsilon > 0.1 and self.t > self.observe:
            self.epsilon -= (1.0 / self.train_frames)

        if self.t % 25000 == 0:
            self.model.save_weights('saved-models/' + self._filename + '-' +
                                    str(self.t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (self._filename, self.t))


'''TODO: need to change to class functions'''


def process_minibatch2(minibatch, model):
    # By Microos: improve this batch-processing function
    # and gain 50-60x faster speed (tested on a GTX 1080),
    # significantly increasing the training FPS.
    # Instead of feeding data to the model one by one,
    # feeding the whole batch is much more efficient.
    mb_len = len(minibatch)

    old_states = np.zeros(shape=(mb_len, 5))
    actions = np.zeros(shape=(mb_len,))
    rewards = np.zeros(shape=(mb_len,))
    new_states = np.zeros(shape=(mb_len, 5))

    for i, m in enumerate(minibatch):
        old_state_m, action_m, reward_m, new_state_m = m
        old_states[i, :] = old_state_m[...]
        actions[i] = action_m
        rewards[i] = reward_m
        new_states[i, :] = new_state_m[...]

    old_qvals = model.predict(old_states, batch_size=mb_len)
    new_qvals = model.predict(new_states, batch_size=mb_len)

    maxQs = np.max(new_qvals, axis=1)
    y = old_qvals
    non_term_inds = np.where(rewards != -500)[0]
    term_inds = np.where(rewards == -500)[0]

    y[non_term_inds, actions[non_term_inds].astype(int)] = \
        rewards[non_term_inds] + (GAMMA * maxQs[non_term_inds])
    y[term_inds, actions[term_inds].astype(int)] = rewards[term_inds]

    X_train = old_states
    y_train = y
    return X_train, y_train
def train_net(turn_model, turn_model_30, turn_model_50, turn_model_70,
              avoid_model, acquire_model, acquire_model_30, acquire_model_50,
              acquire_model_70, hunt_model, pack_model, params):

    filename = params_to_filename(params)

    if cur_mode in [TURN, HUNT, PACK]:
        observe = 2000  # Number of frames to observe before training.
    else:
        observe = 2000

    epsilon = 1  # vary this based on pre-learning already occurred in lower models
    train_frames = 750000  # number of flips for training
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Initialize variables and structures used below.
    max_crash_frame_ctr = 0
    crash_frame_ctr = 0
    total_frame_ctr = 0
    replay_frame_ctr = 0
    stop_ctr = 0
    avoid_ctr = 0
    acquire_ctr = 0
    cum_rwd = 0
    cum_speed = 0
    data_collect = []
    replay = []
    loss_log = []
    # replay stores state, action, reward, new state
    save_init = True
    cur_speeds = []
    for i in range(NUM_DRONES):
        cur_speeds.append(START_SPEED)

    # Initialize drone state holders.
    turn_states = np.zeros(
        [NUM_DRONES, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES])
    avoid_states = np.zeros(
        [NUM_DRONES, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES])
    acquire_states = np.zeros(
        [NUM_DRONES, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES])
    hunt_states = np.zeros(
        [NUM_DRONES, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES])
    drone_states = np.zeros(
        [NUM_DRONES, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES])

    # Create game instance.
    game_state = carmunk.GameState()

    # Get initial state(s).
    turn_state, avoid_state, acquire_state, hunt_state, drone_state, reward, cur_speed = \
        game_state.frame_step(START_DRONE_ID, START_TURN_ACTION, START_SPEED_ACTION,
                              START_PACK_ACTION, START_SPEED, START_DISTANCE, 1)

    # Initialize frame states.
    if cur_mode in [TURN, AVOID, HUNT, PACK]:
        for i in range(NUM_DRONES):
            turn_states[i] = state_frames(
                turn_state,
                np.zeros((1, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES)),
                TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

    if cur_mode in [AVOID, HUNT, PACK]:
        for i in range(NUM_DRONES):
            avoid_states[i] = state_frames(
                avoid_state,
                np.zeros((1, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES)),
                AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

    if cur_mode in [ACQUIRE, HUNT, PACK]:
        for i in range(NUM_DRONES):
            acquire_states[i] = state_frames(
                acquire_state,
                np.zeros((1, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES)),
                ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

    if cur_mode in [HUNT, PACK]:
        for i in range(NUM_DRONES):
            hunt_states[i] = state_frames(
                hunt_state,
                np.zeros((1, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES)),
                HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

    if cur_mode == PACK:
        for i in range(NUM_DRONES):
            drone_states[i] = state_frames(
                drone_state,
                np.zeros((1, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES)),
                DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)
        pack_state = state_frames(
            drone_state,
            np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)),
            PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)

    # Time it.
    start_time = timeit.default_timer()

    # Run frames.
    while total_frame_ctr < train_frames:

        total_frame_ctr += 1    # counts total training distance traveled
        crash_frame_ctr += 1    # counts distance between crashes
        replay_frame_ctr += 1   # counts frames between pack mode replay captures

        # Used to slow things down for de-bugging.
        # time.sleep(0.25)

        for drone_id in range(NUM_DRONES):  # NUM_DRONES = 1, unless you're in PACK mode

            speed_action = START_SPEED_ACTION

            # Choose appropriate action(s).
            # Note: only generates random inputs for the currently training model;
            # all prior (sub) models provide their best (fully-trained) inputs.
            if random.random() < epsilon or total_frame_ctr < observe:
                # epsilon degrades over flips...
                if cur_mode == TURN:
                    turn_action = set_turn_action(
                        True, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id],
                            np.array([turn_states[drone_id]]))

                if cur_mode == AVOID:
                    speed_action = set_avoid_action(
                        True, turn_action,
                        np.array([avoid_states[drone_id]]))
                else:
                    if cur_mode in [HUNT, PACK]:
                        speed_action = set_avoid_action(
                            False, turn_action,
                            np.array([avoid_states[drone_id]]))

                if cur_mode == ACQUIRE:
                    acquire_action = set_acquire_action(
                        True, cur_speeds[drone_id],
                        np.array([acquire_states[drone_id, ]]))
                    turn_action = acquire_action
                else:
                    acquire_action, acquire_model = set_acquire_action(
                        False, cur_speeds[drone_id],
                        np.array([acquire_states[drone_id, ]]))

                if cur_mode == HUNT:
                    hunt_action, turn_action, speed_action = set_hunt_action(
                        True, cur_speeds[drone_id], turn_action, speed_action,
                        acquire_action, np.array([hunt_states[drone_id, ]]))
                else:
                    hunt_action, turn_action, speed_action = set_hunt_action(
                        False, cur_speeds[drone_id], turn_action, speed_action,
                        acquire_action, np.array([hunt_states[drone_id, ]]))

                if cur_mode == PACK and (
                        total_frame_ctr == 1 or
                        (replay_frame_ctr - 1) % PACK_EVAL_FRAMES == 0) and drone_id == 0:
                    pack_action = set_pack_action(True, pack_state)
                    # Note: pack action is only changed every PACK_EVAL_FRAMES;
                    # for frames in between it's constant.

            else:
                # ...increasing use of predictions over time.
                if cur_mode == TURN:
                    turn_action, turn_model = set_turn_action(
                        False, cur_speeds[drone_id],
                        np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id],
                            np.array([turn_states[drone_id]]))

                if cur_mode == AVOID:
                    speed_action = set_avoid_action(
                        False, turn_action,
                        np.array([avoid_states[drone_id]]))
                else:
                    if cur_mode in [HUNT, PACK]:
                        speed_action = set_avoid_action(
                            False, turn_action,
                            np.array([avoid_states[drone_id]]))

                if cur_mode == ACQUIRE:
                    acquire_action, acquire_model = set_acquire_action(
                        False, cur_speeds[drone_id],
                        np.array([acquire_states[drone_id, ]]))
                    turn_action = acquire_action
                else:
                    acquire_action, acquire_model = set_acquire_action(
                        False, cur_speeds[drone_id],
                        np.array([acquire_states[drone_id, ]]))

                if cur_mode == HUNT:
                    hunt_action, turn_action, speed_action = set_hunt_action(
                        False, cur_speeds[drone_id], turn_action, speed_action,
                        acquire_action, np.array([hunt_states[drone_id, ]]))
                else:
                    hunt_action, turn_action, speed_action = set_hunt_action(
                        False, cur_speeds[drone_id], turn_action, speed_action,
                        acquire_action, np.array([hunt_states[drone_id, ]]))

                if cur_mode == PACK and (
                        total_frame_ctr == 1 or
                        (replay_frame_ctr - 1) % PACK_EVAL_FRAMES == 0) and drone_id == 0:
                    # Get 1 pack action for each set of drones on the first drone.
                    pack_action = set_pack_action(False, pack_state)
                    print(pack_action)
                    # print("++++++ pack action:", pack_action)

            # print(2)
            # Pass action, receive new state and reward.
            new_turn_state, new_avoid_state, new_acquire_state, new_hunt_state, \
                new_drone_state, new_reward, new_speed = game_state.frame_step(
                    drone_id, turn_action, speed_action, pack_action,
                    cur_speeds[drone_id], total_frame_ctr, replay_frame_ctr)

            # print("********** 2. new states / rewards:")
            # print(total_frame_ctr)
            # print(drone_id)
            # print(new_drone_state)
            # print(new_reward)
            # print(3)

            # Append (horizontally) historical states for learning speed.
""" note: do this concatination even for models that are not learning (e.g., turn when running search or turn, search and acquire while running hunt) b/c their preds, performed above, expect the same multi-frame view that was in place when they trained.""" if cur_mode in [TURN, AVOID, HUNT, PACK]: new_turn_state = state_frames( new_turn_state, np.array([turn_states[drone_id]]), TURN_TOTAL_SENSORS, TURN_STATE_FRAMES) if cur_mode in [AVOID, HUNT, PACK]: new_avoid_state = state_frames( new_avoid_state, np.array([avoid_states[drone_id]]), AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES) if cur_mode in [ACQUIRE, HUNT, PACK]: new_acquire_state = state_frames( new_acquire_state, np.array([acquire_states[drone_id]]), ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES) if cur_mode in [HUNT, PACK]: new_hunt_state = state_frames( new_hunt_state, np.array([hunt_states[drone_id]]), HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES) #print(4) if cur_mode == PACK and (total_frame_ctr == 1 or replay_frame_ctr % PACK_EVAL_FRAMES == 0): if drone_id == 0: # for 1st drone, pack state = drone state new_pack_state = new_drone_state pack_rwd = new_reward else: # otherwise, append drone record to prior drone state new_pack_state = state_frames(new_pack_state, new_drone_state, DRONE_TOTAL_SENSOR, 2) pack_rwd += new_reward new_drone_state = state_frames( new_drone_state, np.array([drone_states[drone_id]]), DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES) if drone_id == (NUM_DRONES - 1): # for last drone build pack record if total_frame_ctr == 1: pack_state = np.zeros( (1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)) new_pack_state = state_frames( new_pack_state, pack_state, PACK_TOTAL_SENSORS, PACK_STATE_FRAMES ) #may need to add 1 to PACK_STATE_FRAMES #print("**** 3. final pack reward:") #print(pack_rwd) #print(5) # experience replay storage """note: only the model being trained requires event storage as it is stack that will be sampled for training below.""" if cur_mode == TURN: replay.append((np.array([turn_states[drone_id]]), turn_action, new_reward, new_turn_state)) elif cur_mode == AVOID: replay.append((np.array([avoid_states[drone_id]]), speed_action, new_reward, new_avoid_state)) elif cur_mode == ACQUIRE: replay.append((np.array([acquire_states[drone_id]]), turn_action, new_reward, new_acquire_state)) elif cur_mode == HUNT: replay.append((np.array([hunt_states[drone_id]]), hunt_action, new_reward, new_hunt_state)) elif cur_mode == PACK and (total_frame_ctr == 1 or replay_frame_ctr % PACK_EVAL_FRAMES == 0) and drone_id == (NUM_DRONES - 1): replay.append( (pack_state, pack_action, pack_rwd, new_pack_state)) #print(replay[-1]) #print("6a") # If we're done observing, start training. if total_frame_ctr > observe and ( cur_mode != PACK or (replay_frame_ctr % PACK_EVAL_FRAMES == 0 and drone_id == (NUM_DRONES - 1))): # If we've stored enough in our buffer, pop the oldest. if len(replay) > buffer: replay.pop(0) # Randomly sample our experience replay memory minibatch = random.sample(replay, batchSize) if cur_mode == TURN: # Get training values. 
                    X_train, y_train = process_minibatch(
                        minibatch, turn_model, TURN_NUM_INPUT, TURN_NUM_OUTPUT)
                    history = LossHistory()
                    turn_model.fit(X_train, y_train, batch_size=batchSize,
                                   nb_epoch=1, verbose=0, callbacks=[history])

                elif cur_mode == AVOID:
                    X_train, y_train = process_minibatch(
                        minibatch, avoid_model, AVOID_NUM_INPUT, AVOID_NUM_OUTPUT)
                    history = LossHistory()
                    avoid_model.fit(X_train, y_train, batch_size=batchSize,
                                    nb_epoch=1, verbose=0, callbacks=[history])

                elif cur_mode == ACQUIRE:
                    X_train, y_train = process_minibatch(
                        minibatch, acquire_model, ACQUIRE_NUM_INPUT, ACQUIRE_NUM_OUTPUT)
                    history = LossHistory()
                    acquire_model.fit(X_train, y_train, batch_size=batchSize,
                                      nb_epoch=1, verbose=0, callbacks=[history])

                elif cur_mode == HUNT:
                    X_train, y_train = process_minibatch(
                        minibatch, hunt_model, HUNT_NUM_INPUT, HUNT_NUM_OUTPUT)
                    history = LossHistory()
                    hunt_model.fit(X_train, y_train, batch_size=batchSize,
                                   nb_epoch=1, verbose=0, callbacks=[history])

                elif cur_mode == PACK:
                    X_train, y_train = process_minibatch(
                        minibatch, pack_model, PACK_NUM_INPUT, PACK_NUM_OUTPUT)
                    history = LossHistory()
                    pack_model.fit(X_train, y_train, batch_size=batchSize,
                                   nb_epoch=1, verbose=0, callbacks=[history])

                loss_log.append(history.losses)

            # Update the starting state with S'.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                turn_states[drone_id] = new_turn_state

            if cur_mode in [AVOID, HUNT, PACK]:
                avoid_states[drone_id] = new_avoid_state

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                acquire_states[drone_id] = new_acquire_state

            if cur_mode in [HUNT, PACK]:
                hunt_states[drone_id] = new_hunt_state

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                drone_states[drone_id] = new_drone_state
                if drone_id == (NUM_DRONES - 1):
                    pack_state = new_pack_state
                    replay_frame_ctr = 0

            cur_speeds[drone_id] = new_speed
            cum_rwd += new_reward

            # In case of crash, report and initialize.
            if new_reward == -500 or new_reward == -1000:
                # Log the car's distance at this T.
                data_collect.append([total_frame_ctr, crash_frame_ctr])

                # Update max.
                if crash_frame_ctr > max_crash_frame_ctr:
                    max_crash_frame_ctr = crash_frame_ctr

                # Time it.
                tot_time = timeit.default_timer() - start_time
                fps = crash_frame_ctr / tot_time

                # Output some stuff so we can watch.
                # try:
                print("Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d\t fps: %d" %
                      (max_crash_frame_ctr, total_frame_ctr, epsilon,
                       crash_frame_ctr, cur_mode, cum_rwd, int(fps)))
                #     break
                # except (RuntimeError, TypeError, NameError):
                #     pass

                # Reset.
                crash_frame_ctr = cum_rwd = cum_speed = 0
                start_time = timeit.default_timer()

            # print(9)
            # Decrement epsilon for another frame.
            if epsilon > 0.1 and total_frame_ctr > observe:
                epsilon -= (1 / train_frames)

            if total_frame_ctr % 10000 == 0:
                if crash_frame_ctr != 0:
                    # try:
                    print("Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d" %
                          (max_crash_frame_ctr, total_frame_ctr, epsilon,
                           crash_frame_ctr, cur_mode, cum_rwd))
                    #     break
                    # except (RuntimeError, TypeError, NameError):
                    #     pass

            # Save the model every 50k frames.
            if total_frame_ctr % 50000 == 0:
                save_init = False
                if cur_mode == TURN:
                    turn_model.save_weights('models/turn/turn-' + filename + '-' +
                                            str(START_SPEED) + '-' +
                                            str(total_frame_ctr) + '.h5',
                                            overwrite=True)
                    print("Saving turn_model %s - %d - %d" %
                          (filename, START_SPEED, total_frame_ctr))

                elif cur_mode == AVOID:
                    avoid_model.save_weights('models/avoid/avoid-' + filename + '-' +
                                             str(total_frame_ctr) + '.h5',
                                             overwrite=True)
                    print("Saving avoid_model %s - %d" % (filename, total_frame_ctr))

                elif cur_mode == ACQUIRE:
                    acquire_model.save_weights('models/acquire/acquire-' + filename + '-' +
                                               str(START_SPEED) + '-' +
                                               str(total_frame_ctr) + '.h5',
                                               overwrite=True)
                    print("Saving acquire_model %s - %d" % (filename, total_frame_ctr))

                elif cur_mode == HUNT:
                    hunt_model.save_weights('models/hunt/hunt-' + filename + '-' +
                                            str(total_frame_ctr) + '.h5',
                                            overwrite=True)
                    print("Saving hunt_model %s - %d" % (filename, total_frame_ctr))

                elif cur_mode == PACK:
                    pack_model.save_weights('models/pack/pack-' + filename + '-' +
                                            str(total_frame_ctr) + '.h5',
                                            overwrite=True)
                    print("Saving pack_model %s - %d" % (filename, total_frame_ctr))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
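# state_frames() is called throughout the function above but is not shown in
# this section. The sketch below is only an assumption of its behavior: it
# prepends the newest sensor readings to the rolling history and truncates to
# the last num_frames frames, returning a (1, num_sensors * num_frames) row
# ready for model.predict(). Treat the exact ordering and shape as hypothetical.
import numpy as np


def state_frames(new_state, old_state, num_sensors, num_frames):
    combined = np.append(new_state, old_state)        # newest readings first
    combined = combined[: num_sensors * num_frames]   # drop the oldest frame
    return combined.reshape(1, num_sensors * num_frames)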
def train_net(model, params):
    global counter
    global lastState
    global last_action
    global lastreward

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 1000000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step((2))
    # state = np.array([14, 14, 14, 14, 14, 14, 14, 14, 14])
    # state = np.expand_dims(state, axis=0)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(train_new_state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        if lastreward < -100:
            lastState = state

        train_state = np.append(lastState, state[0])
        train_state = np.append(train_state, last_action)
        train_state = np.expand_dims(train_state, axis=0)

        reward, new_state = game_state.frame_step(action)

        train_new_state = np.append(state[0], new_state[0])
        train_new_state = np.append(train_new_state, action)
        train_new_state = np.expand_dims(train_new_state, axis=0)

        if sum(state[0]) >= 42:
            counter += 1
            if counter % 40 == 0:
                replay.append((train_state, action, reward, train_new_state))
            if counter > 1000000000:
                counter = 0
        else:
            replay.append((train_state, action, reward, train_new_state))

        lastState = np.copy(state)
        state = np.copy(new_state)

        # Experience replay storage.
        last_action = action

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            batchSize1 = len(X_train)
            model.fit(X_train, y_train, batch_size=batchSize1,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= 5 * (1 / train_frames)

        # We died, so update stuff.
        lastreward = reward
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model every 10,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000
    epsilon = 1
    train_frames = 100000
    batchSize = params['batchSize']
    buffer = params['buffer']

    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []
    loss_log = []

    game_state = UI.GameState()
    _, state = game_state.frame_step((2))
    start_time = timeit.default_timer()

    while t < train_frames:
        t += 1
        car_distance += 1

        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 3)
        else:
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))

        reward, new_state = game_state.frame_step(action)
        replay.append((state, action, reward, new_state))

        if t > observe:
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)
            X_train, y_train = process_minibatch2(minibatch, model)

            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        state = new_state

        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        if reward == -500:
            data_collect.append([t, car_distance])
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    log_results(filename, data_collect, loss_log)
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 110000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step((1))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(3)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # best

        # Take action, observe new state and get our treat.
        reward, new_state = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # print("start")
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        # We died, so update stuff.
        if reward <= -500:
            # print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0

        # We reached the goal, so update stuff.
        elif reward >= 2000:
            print("Reached goal.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Reset.
            car_distance = 0

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' +
                               str(t) + '.h5', overwrite=True)
            print("Saving model %s - %d" % (filename, t))
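# Every variant above calls params_to_filename(params), and most end with
# log_results(...); neither helper appears in this section. The sketches below
# are assumptions about minimal versions of them -- a filename built from the
# hyperparameters (a params['nn'] layer list is assumed here) and two CSV dumps
# for later plotting -- and may differ from the real implementations, in
# particular where variants pass extra arguments to log_results.
import csv


def params_to_filename(params):
    # e.g. "164-150-64-50000" for nn=[164, 150], batchSize=64, buffer=50000.
    return (str(params['nn'][0]) + '-' + str(params['nn'][1]) + '-' +
            str(params['batchSize']) + '-' + str(params['buffer']))


def log_results(filename, data_collect, loss_log):
    # Distance (or reward) reached at each crash, for the learning curve.
    with open('results/learn_data-' + filename + '.csv', 'w', newline='') as f:
        csv.writer(f).writerows(data_collect)

    # Per-batch training losses.
    with open('results/loss_data-' + filename + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for loss_item in loss_log:
            writer.writerow(loss_item)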