def __init__(self):
    self.game = carmunk.GameState()
    self.episodes_length = 10000
    self.nn = NN(7, 3)
    self.gamma = 0.9

    # Generate the necessary TensorFlow ops.
    self.inputs1, self.nextQ = self.nn.placeholder_inputs(None)
    self.Qout = self.nn.inference(self.inputs1, 128, 32)
    self.loss = self.nn.loss_val(self.Qout, self.nextQ)
    self.train_op = self.nn.training(self.loss, learning_rate=0.01)
    self.time_per_epoch = tf.placeholder(tf.float32, shape=())
    # Note: initialize_all_variables / scalar_summary / merge_all_summaries
    # are pre-1.0 TensorFlow APIs (tf.global_variables_initializer,
    # tf.summary.scalar and tf.summary.merge_all in TF >= 1.0).
    self.init = tf.initialize_all_variables()
    self.saver = tf.train.Saver()

    # Generate the requisite replay buffer.
    self.experience_memory = 10000
    self.replay = []
    self.minibatch_size = 128
    self.epsilon = 0.9
    # self.saver.restore(self.sess, "newmodel1.ckpt")
    self.logs_path = '/tmp/tensorflow_logs/example21'

    # Create a summary to monitor the time-per-epoch tensor.
    tf.scalar_summary("timeperepoch", self.time_per_epoch)
    # Merge all summaries into a single op.
    self.merged_summary_op = tf.merge_all_summaries()
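# The NN class used above is not shown in this snippet. Below is a minimal
# sketch matching the calls made in __init__ (placeholder_inputs, inference
# with two hidden layers, loss_val, training), written against the same
# TF 1.x-era API. Everything beyond the 7-input/3-output sizes and the
# 128/32 hidden units visible above is an assumption, not the original code.
import tensorflow as tf

class NN(object):
    def __init__(self, num_inputs, num_actions):
        self.num_inputs = num_inputs
        self.num_actions = num_actions

    def placeholder_inputs(self, batch_size):
        # A None batch dimension lets the same graph serve single states
        # and minibatches.
        inputs = tf.placeholder(tf.float32, shape=(batch_size, self.num_inputs))
        targets = tf.placeholder(tf.float32, shape=(batch_size, self.num_actions))
        return inputs, targets

    def inference(self, inputs, hidden1_units, hidden2_units):
        # Two fully connected ReLU layers, then a linear Q-value head.
        w1 = tf.Variable(tf.truncated_normal([self.num_inputs, hidden1_units], stddev=0.1))
        b1 = tf.Variable(tf.zeros([hidden1_units]))
        h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        w2 = tf.Variable(tf.truncated_normal([hidden1_units, hidden2_units], stddev=0.1))
        b2 = tf.Variable(tf.zeros([hidden2_units]))
        h2 = tf.nn.relu(tf.matmul(h1, w2) + b2)
        w3 = tf.Variable(tf.truncated_normal([hidden2_units, self.num_actions], stddev=0.1))
        b3 = tf.Variable(tf.zeros([self.num_actions]))
        return tf.matmul(h2, w3) + b3

    def loss_val(self, q_out, next_q):
        # Mean squared Bellman error against the target Q values.
        return tf.reduce_mean(tf.square(next_q - q_out))

    def training(self, loss, learning_rate):
        return tf.train.AdamOptimizer(learning_rate).minimize(loss)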
def play(model, weights):
    car_distance = 0
    game_state = carmunk.GameState(weights)
    _, state, __ = game_state.frame_step(2)
    featureExpectations = np.zeros(len(weights))

    # Move.
    #time.sleep(15)
    while True:
        time.sleep(0.01)
        car_distance += 1

        # Choose action.
        action = np.argmax(model.predict(state, batch_size=1))
        #print ("Action ", action)

        # Take action.
        immediateReward, state, readings = game_state.frame_step(action)
        #print ("immediate reward:: ", immediateReward)
        #print ("readings :: ", readings)

        # Start recording feature expectations only after 100 frames.
        if car_distance > 100:
            featureExpectations += (GAMMA ** (car_distance - 101)) * np.array(readings)
        #print ("Feature Expectations :: ", featureExpectations)

        # Tell us something.
        if car_distance % 2000 == 0:
            print("Current distance: %d frames." % car_distance)
            break

    return featureExpectations
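# For reference, the quantity this loop accumulates is the discounted
# feature-expectation estimate mu = sum_t gamma^t * phi(s_t) used in
# apprenticeship learning. A minimal standalone sketch of the same
# computation, assuming `trajectory` is a list of per-frame `readings`
# vectors; the helper name and the module-level GAMMA default are
# assumptions for illustration.
import numpy as np

GAMMA = 0.9  # assumed discount factor; the scripts above read it from module scope

def estimate_feature_expectations(trajectory, gamma=GAMMA):
    """Estimate mu = sum_t gamma^t * phi(s_t) from one rollout."""
    mu = np.zeros(len(trajectory[0]))
    for t, phi in enumerate(trajectory):
        mu += (gamma ** t) * np.array(phi)
    return mu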
def play(model):
    car_distance = 0
    game_state = carmunk.GameState()
    # Use a comprehension so the four frames are independent lists;
    # [[None] * 3] * 4 would alias the same inner list four times.
    four_states = [[None] * 3 for _ in range(4)]

    # Do nothing to get initial.
    _, start_state = game_state.frame_step(2)
    rotate_state(four_states, start_state)

    # Move.
    while True:
        car_distance += 1
        flat_four = np.array([flatten_state(four_states)])

        # Choose action.
        action = np.argmax(model.predict(flat_four, batch_size=1))

        # Take action.
        _, state = game_state.frame_step(action)
        rotate_state(four_states, state)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
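# rotate_state and flatten_state are referenced above but not shown. A
# minimal sketch of what they plausibly do, assuming each frame is a
# 3-reading sensor sequence and the buffer keeps the four most recent
# frames; the names come from the snippet, but these bodies are assumptions.
def rotate_state(four_states, new_state):
    # Drop the oldest frame and append the newest, in place.
    four_states.pop(0)
    four_states.append(list(new_state))

def flatten_state(four_states):
    # Concatenate the four 3-reading frames into one 12-element vector.
    return [reading for frame in four_states for reading in frame]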
def play(model):
    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    _, state = game_state.frame_step(1)

    # Move.
    while True:
        car_distance += 1

        # Choose action.
        #action = (np.argmax(model.predict(state, batch_size=1)))

        # Take action, keeping a small amount of exploration even at play time.
        if random.random() < 0.099:
            action = np.random.randint(3)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = np.argmax(qval)  # best

        _, state = game_state.frame_step(action)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
def play(screen):
    sess = tf.InteractiveSession()
    saved_model = 'saved-models_brown/evaluatedPolicies/1-164-150-100-50000-100000.h5'
    model = Policy_Network(NUM_STATES, [164, 150], sess, saved_model)
    car_distance = 0
    weights = [-0.26275824, 0.03635492, 0.09312051, 0.00469211,
               -0.18295909, 0.6987476, -0.59225824, -0.2201157]  # brown
    # weights = [-0.06099233, -0.20316265, -0.1427778, -0.16924885, 0.25280695, -0.0025343, 0.30678838, -0.86483369]
    # weights = [1, 1, 1, 1, 1, 1, 1, 1]  # just some random weights; they do not matter when computing feature expectations
    game_state = carmunk.GameState(weights, [0, 0, 1, 0])
    _, state, __ = game_state.frame_step(2)
    featureExpectations = np.zeros(len(weights))
    Prev = np.zeros(len(weights))
    replay = []

    while True:
        car_distance += 1

        # Read the keyboard so a human can drive the car.
        event = screen.getch()
        if event == curses.KEY_LEFT:
            action = 1
        elif event == curses.KEY_RIGHT:
            action = 0
        elif event == curses.KEY_DOWN:
            break
        else:
            action = 2

        # Take action.
        immediateReward, new_state, readings = game_state.frame_step(action)
        replay.append((state, action, immediateReward, new_state))
        state = new_state

        # Start recording feature expectations only after 100 frames.
        if car_distance > 100:
            featureExpectations += (GAMMA ** (car_distance - 101)) * np.array(readings)

        # Tell us something.
        changePercentage = (np.linalg.norm(featureExpectations - Prev) * 100.0) / np.linalg.norm(featureExpectations)
        print(car_distance)
        print("percentage change in Feature expectation ::", changePercentage)
        Prev = np.array(featureExpectations)

        if car_distance % 300 == 0:
            break

    Xtrain, Ytrain = process_minibatch(replay, model)
    np.save('xtrain_brown.npy', Xtrain)
    np.save('ytrain_brown.npy', Ytrain)
    return featureExpectations
def play(screen):
    car_distance = 0
    weights = [1, 1, 1, 1, 1, 1, 1, 1]  # just some random weights; they do not matter when computing feature expectations
    game_state = carmunk.GameState(weights)
    _, state, __ = game_state.frame_step(2)
    featureExpectations = np.zeros(len(weights))
    Prev = np.zeros(len(weights))

    while True:
        car_distance += 1

        # Read the keyboard so a human can drive the car.
        event = screen.getch()
        if event == curses.KEY_LEFT:
            action = 1
        elif event == curses.KEY_RIGHT:
            action = 0
        elif event == curses.KEY_DOWN:
            break
        else:
            action = 2

        # Take action.
        immediateReward, state, readings = game_state.frame_step(action)

        # Start recording feature expectations only after 100 frames.
        if car_distance > 100:
            featureExpectations += (GAMMA ** (car_distance - 101)) * np.array(readings)

        # Tell us something.
        changePercentage = (np.linalg.norm(featureExpectations - Prev) * 100.0) / np.linalg.norm(featureExpectations)
        print(car_distance)
        print("percentage change in Feature expectation ::", changePercentage)
        Prev = np.array(featureExpectations)

        if car_distance % 2000 == 0:
            break

    return featureExpectations
def play(model):
    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    _, state = game_state.frame_step(2)

    # Move.
    while True:
        car_distance += 1

        # Choose action.
        action = np.argmax(model.predict(state, batch_size=1))

        # Take action.
        _, state = game_state.frame_step(action)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
def play(model):
    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    state, _, speed, _, _, _ = game_state.frame_step(START_ACTION, START_SPEED, START_DISTANCE)

    # Move.
    while True:
        car_distance += 1

        # Choose action.
        action = np.argmax(model.predict(state, batch_size=1))

        # Take action.
        state, _, speed, _, _, _ = game_state.frame_step(action, speed, car_distance)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)
def play(model):
    car_distance = 0
    game_state = carmunk.GameState()

    # Do nothing to get initial.
    reward, state = game_state.frame_step(2)

    # Change this to "while True" to make it never die.
    while reward != -500:
        car_distance += 1

        # Choose action.
        action = np.argmax(model.predict(state, batch_size=1))

        # Take action.
        reward, state = game_state.frame_step(action)

        # Tell us something.
        if car_distance % 1000 == 0:
            print("Current distance: %d frames." % car_distance)

    print("Made it %d frames." % car_distance)
def play(model, weights, sess=None):
    car_distance = 0
    game_state = carmunk.GameState(weights, [1, 0, 0, 0])
    _, state, __ = game_state.frame_step(2)
    # state = state + [1,0,0,0]
    featureExpectations = np.zeros(len(weights))

    # Move.
    #time.sleep(15)
    while True:
        car_distance += 1

        # Choose action.
        action = np.argmax(model.predict(state))
        # F = compute_fisher(model, [state], sess)
        # print(F)
        #print ("Action ", action)

        # Take action.
        immediateReward, state, readings = game_state.frame_step(action)
        # state = state + [1,0,0,0]
        #print ("immediate reward:: ", immediateReward)
        #print ("readings :: ", readings)

        # Start recording feature expectations only after 100 frames.
        if car_distance > 100:
            featureExpectations += (GAMMA ** (car_distance - 101)) * np.array(readings)
        #print ("Feature Expectations :: ", featureExpectations)

        # Tell us something.
        if car_distance % 2000 == 0:
            print("Current distance: %d frames." % car_distance)
            break

    return featureExpectations
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 110000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step(1)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(3)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)
            action = np.argmax(qval)  # best

        # Take action, observe new state and get our treat.
        reward, new_state = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)
            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        # We died, so update stuff.
        if reward <= -500:
            #print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset.
            car_distance = 0

        # We reached the goal, so update stuff.
        elif reward >= 2000:
            print("Reached goal.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset.
            car_distance = 0

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))
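# process_minibatch2 and LossHistory are referenced throughout these
# training loops but never defined in the snippets. A minimal sketch of a
# compatible pair, assuming a (1, NUM_INPUT)-shaped state, GAMMA as the
# discount factor, and the crash reward of -500 as the terminal marker;
# these bodies are assumptions, not the original implementations.
import numpy as np
from keras.callbacks import Callback

GAMMA = 0.9    # assumed discount factor
NUM_INPUT = 3  # assumed sensor count

class LossHistory(Callback):
    """Collect per-batch training losses so they can be logged."""
    def on_train_begin(self, logs=None):
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        self.losses.append(logs.get('loss'))

def process_minibatch2(minibatch, model):
    """Build (X, y) Q-learning targets from (S, A, R, S') tuples."""
    X_train, y_train = [], []
    for state, action, reward, new_state in minibatch:
        old_qvals = model.predict(state, batch_size=1)
        new_qvals = model.predict(new_state, batch_size=1)
        y = old_qvals[0].copy()
        if reward == -500:
            # Terminal transition: no bootstrapped term.
            y[action] = reward
        else:
            # Non-terminal: r + gamma * max_a' Q(s', a').
            y[action] = reward + GAMMA * np.max(new_qvals)
        X_train.append(state.reshape(NUM_INPUT,))
        y_train.append(y)
    return np.array(X_train), np.array(y_train)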
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 100000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    min_distance = 10000
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step(2)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action. Note: the epsilon-greedy branch is commented out,
        # so this version always acts greedily (epsilon still decays below
        # but is unused).
        # if random.random() < epsilon or t < observe:
        #     action = np.random.randint(0, 3)  # random
        # else:
        # Get Q values for each action.
        qval = model.predict(state, batch_size=1)
        action = np.argmax(qval)  # best

        # Take action, observe new state and get our treat.
        reward, new_state, distance = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory:
            # a list of batchSize tuples, each with 4 elements (S, A, R, S').
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        if distance < min_distance:
            min_distance = distance

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            # print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d\t%f fps" %
            #       (max_car_distance, t, epsilon, car_distance, distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        if t % 10 == 0:
            print("Max_car_distance: %d at %d\tepsilon %f\t(%d)\tdistance %d \tmin_distance %d" %
                  (max_car_distance, t, epsilon, car_distance, distance, min_distance))

        # Save the model every 10,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
def train_net(model):
    observe = 1000  # Number of frames to observe before training.
    epochs = 1000   # Number of games to play.
    epsilon = 1
    batchSize = 40
    # buffer = 50000
    buffer = 5000

    # Just stuff used below.
    max_car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').

    for i in range(epochs):
        # Create a new game instance.
        game_state = carmunk.GameState()
        status = 1

        # Get initial state by doing nothing and getting the state.
        _, state = game_state.frame_step(2)

        car_distance = 0  # Reset.

        while status == 1:
            t += 1
            car_distance += 1

            # Get Q values for each action.
            qval = model.predict(state, batch_size=1)

            # Choose an action.
            if random.random() < epsilon or t < observe:
                action = np.random.randint(0, 3)  # random
            else:
                action = np.argmax(qval)  # best

            # Take action, observe new state and get our treat.
            reward, new_state = game_state.frame_step(action)

            # Experience replay storage.
            replay.append((state, action, reward, new_state))

            # If we're done observing, start training.
            if t > observe:
                # If we've stored enough in our buffer, pop the oldest.
                if len(replay) > buffer:
                    replay.pop(0)

                # Randomly sample our experience replay memory.
                minibatch = random.sample(replay, batchSize)

                # Get training values.
                X_train, y_train = process_minibatch(minibatch)

                # Train the model on this batch.
                model.fit(X_train, y_train, batch_size=batchSize,
                          nb_epoch=1, verbose=0)

            # Update the starting state with S'.
            state = new_state

            # We died, so update stuff.
            if reward == -500:
                status = 0
                if car_distance > max_car_distance:
                    max_car_distance = car_distance
                    # Save the model.
                    model.save_weights('saved-models/model-weights-' +
                                       str(car_distance) + '.h5', overwrite=True)

        # Decrement epsilon over time. Use a float literal so this does not
        # round to zero under Python 2 integer division.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / epochs)

        # Log the car's distance at this T.
        data_collect.append([t, car_distance])

        print("Max: %d at %d\tgame %d\tepsilon %f\t(%d)" %
              (max_car_distance, t, i, epsilon, car_distance))

    # Save the results to a file so we can graph it later.
    data_dump = open('results/learn_data-' + str(t) + '.csv', 'w')
    wr = csv.writer(data_dump)
    wr.writerows(data_collect)

    # Save a last version of the model.
    model.save_weights('saved-models/model-weights-' + str(t) + '.h5',
                       overwrite=True)
def train_net(model, params):
    global counter
    global lastState
    global last_action
    global lastreward

    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 1000000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state = game_state.frame_step(2)
    # state = np.array([14,14,14,14,14,14,14,14,14])
    # state = np.expand_dims(state, axis=0)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action. Note: the greedy branch predicts on
        # train_new_state, the augmented (state, next state, action) vector
        # built below; it is only defined after the first frame, which is
        # safe because the first `observe` frames are always random.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            qval = model.predict(train_new_state, batch_size=1)
            action = np.argmax(qval)  # best

        # Take action, observe new state and get our treat.
        if lastreward < -100:
            lastState = state

        train_state = np.append(lastState, state[0])
        train_state = np.append(train_state, last_action)
        train_state = np.expand_dims(train_state, axis=0)

        reward, new_state = game_state.frame_step(action)

        train_new_state = np.append(state[0], new_state[0])
        train_new_state = np.append(train_new_state, action)
        train_new_state = np.expand_dims(train_new_state, axis=0)

        # Experience replay storage. Throttle storage of "all sensors clear"
        # states so the buffer is not dominated by them.
        if sum(state[0]) >= 42:
            counter += 1
            if counter % 40 == 0:
                replay.append((train_state, action, reward, train_new_state))
            if counter > 1000000000:
                counter = 0
        else:
            replay.append((train_state, action, reward, train_new_state))

        lastState = np.copy(state)
        state = np.copy(new_state)
        last_action = action

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            batchSize1 = len(X_train)
            model.fit(X_train, y_train, batch_size=batchSize1,
                      nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Decrement epsilon over time (float literal avoids Python 2 integer
        # division rounding to zero).
        if epsilon > 0.1 and t > observe:
            epsilon -= 5 * (1.0 / train_frames)

        lastreward = reward

        # We died, so update stuff.
        if reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model every 10,000 frames.
        if t % 10000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
reward_sum = 0
episode_number = 0

policy_network = PolicyNetwork(learning_rate)

# saver
saver = tf.train.Saver()

# session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
if resume:
    saver.restore(sess, model_path)

# create a new game instance
env = carmunk.GameState()
done = False

# get initial state by doing nothing and getting the state
_, observation = env.frame_step(2)

while episode_number < max_episode_number:
    current_state = observation
    #print (current_state)

    # forward the policy network and sample an action from the returned probability
    action_prob = policy_network.predict(current_state[np.newaxis, :], sess)
    action = np.random.choice(a=3, p=action_prob.ravel())
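# This fragment ends right after sampling an action; a REINFORCE-style loop
# would typically collect per-step rewards and convert them to discounted
# returns before each policy update. A minimal sketch of that return
# computation, assuming a gamma discount and rewards gathered into a list;
# the helper name and the normalization step are assumptions.
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    """Compute normalized discounted returns G_t = sum_k gamma^k * r_{t+k}."""
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    # Normalize to reduce gradient variance (a common REINFORCE trick).
    discounted -= discounted.mean()
    discounted /= (discounted.std() + 1e-8)
    return discounted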
def train_net(turn_model, turn_model_30, turn_model_50, turn_model_70,
              avoid_model, acquire_model, acquire_model_30, acquire_model_50,
              acquire_model_70, hunt_model, pack_model, params):

    filename = params_to_filename(params)

    # Number of frames to observe before training. Note: both branches are
    # currently 2000; the split is kept so the modes can diverge later.
    if cur_mode in [TURN, HUNT, PACK]:
        observe = 2000
    else:
        observe = 2000

    epsilon = 1  # vary this based on pre-learning already done in the lower models
    train_frames = 750000  # number of frames for training
    batchSize = params['batchSize']
    buffer = params['buffer']

    # initialize variables and structures used below.
    max_crash_frame_ctr = 0
    crash_frame_ctr = 0
    total_frame_ctr = 0
    replay_frame_ctr = 0
    stop_ctr = 0
    avoid_ctr = 0
    acquire_ctr = 0
    cum_rwd = 0
    cum_speed = 0
    data_collect = []
    replay = []  # replay stores state, action, reward, new state
    loss_log = []
    save_init = True
    cur_speeds = []
    for i in range(NUM_DRONES):
        cur_speeds.append(START_SPEED)

    # initialize drone state holders
    turn_states = np.zeros([NUM_DRONES, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES])
    avoid_states = np.zeros([NUM_DRONES, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES])
    acquire_states = np.zeros([NUM_DRONES, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES])
    hunt_states = np.zeros([NUM_DRONES, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES])
    drone_states = np.zeros([NUM_DRONES, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES])

    # create game instance
    game_state = carmunk.GameState()

    # get initial state(s)
    turn_state, avoid_state, acquire_state, hunt_state, drone_state, reward, cur_speed = \
        game_state.frame_step(START_DRONE_ID, START_TURN_ACTION, START_SPEED_ACTION,
                              START_PACK_ACTION, START_SPEED, START_DISTANCE, 1)

    # initialize frame states
    if cur_mode in [TURN, AVOID, HUNT, PACK]:
        for i in range(NUM_DRONES):
            turn_states[i] = state_frames(
                turn_state,
                np.zeros((1, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES)),
                TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

    if cur_mode in [AVOID, HUNT, PACK]:
        for i in range(NUM_DRONES):
            avoid_states[i] = state_frames(
                avoid_state,
                np.zeros((1, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES)),
                AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

    if cur_mode in [ACQUIRE, HUNT, PACK]:
        for i in range(NUM_DRONES):
            acquire_states[i] = state_frames(
                acquire_state,
                np.zeros((1, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES)),
                ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

    if cur_mode in [HUNT, PACK]:
        for i in range(NUM_DRONES):
            hunt_states[i] = state_frames(
                hunt_state,
                np.zeros((1, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES)),
                HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

    if cur_mode == PACK:
        for i in range(NUM_DRONES):
            drone_states[i] = state_frames(
                drone_state,
                np.zeros((1, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES)),
                DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

        pack_state = state_frames(
            drone_state,
            np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)),
            PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)

    # time it
    start_time = timeit.default_timer()

    # run frames
    while total_frame_ctr < train_frames:
        total_frame_ctr += 1    # counts total training distance traveled
        crash_frame_ctr += 1    # counts distance between crashes
        replay_frame_ctr += 1   # counts frames between pack mode replay captures

        # used to slow things down for de-bugging
        #time.sleep(0.25)

        for drone_id in range(NUM_DRONES):  # NUM_DRONES = 1, unless you're in PACK mode
            speed_action = START_SPEED_ACTION

            # choose appropriate action(s).
            # note: random inputs are generated only for the currently
            # training model; all prior (sub) models provide their best
            # (fully-trained) inputs.
            if random.random() < epsilon or total_frame_ctr < observe:
                # epsilon decays over frames...
                if cur_mode == TURN:
                    turn_action = set_turn_action(
                        True, cur_speeds[drone_id], np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id], np.array([turn_states[drone_id]]))
                    if cur_mode == AVOID:
                        speed_action = set_avoid_action(
                            True, turn_action, np.array([avoid_states[drone_id]]))
                    else:
                        if cur_mode in [HUNT, PACK]:
                            speed_action = set_avoid_action(
                                False, turn_action, np.array([avoid_states[drone_id]]))
                    if cur_mode == ACQUIRE:
                        acquire_action = set_acquire_action(
                            True, cur_speeds[drone_id], np.array([acquire_states[drone_id, ]]))
                        turn_action = acquire_action
                    else:
                        acquire_action, acquire_model = set_acquire_action(
                            False, cur_speeds[drone_id], np.array([acquire_states[drone_id, ]]))
                    if cur_mode == HUNT:
                        hunt_action, turn_action, speed_action = set_hunt_action(
                            True, cur_speeds[drone_id], turn_action, speed_action,
                            acquire_action, np.array([hunt_states[drone_id, ]]))
                    else:
                        hunt_action, turn_action, speed_action = set_hunt_action(
                            False, cur_speeds[drone_id], turn_action, speed_action,
                            acquire_action, np.array([hunt_states[drone_id, ]]))
                    if cur_mode == PACK and (total_frame_ctr == 1 or
                            (replay_frame_ctr - 1) % PACK_EVAL_FRAMES == 0) and drone_id == 0:
                        pack_action = set_pack_action(True, pack_state)
                        # note: pack action only changes every PACK_EVAL_FRAMES;
                        # for frames in between it's constant.
            else:
                # ...increasing use of predictions over time
                if cur_mode == TURN:
                    turn_action, turn_model = set_turn_action(
                        False, cur_speeds[drone_id], np.array([turn_states[drone_id]]))
                else:
                    if cur_mode in [AVOID, HUNT, PACK]:
                        turn_action, turn_model = set_turn_action(
                            False, cur_speeds[drone_id], np.array([turn_states[drone_id]]))
                    if cur_mode == AVOID:
                        speed_action = set_avoid_action(
                            False, turn_action, np.array([avoid_states[drone_id]]))
                    else:
                        if cur_mode in [HUNT, PACK]:
                            speed_action = set_avoid_action(
                                False, turn_action, np.array([avoid_states[drone_id]]))
                    if cur_mode == ACQUIRE:
                        acquire_action, acquire_model = set_acquire_action(
                            False, cur_speeds[drone_id], np.array([acquire_states[drone_id, ]]))
                        turn_action = acquire_action
                    else:
                        acquire_action, acquire_model = set_acquire_action(
                            False, cur_speeds[drone_id], np.array([acquire_states[drone_id, ]]))
                    if cur_mode == HUNT:
                        hunt_action, turn_action, speed_action = set_hunt_action(
                            False, cur_speeds[drone_id], turn_action, speed_action,
                            acquire_action, np.array([hunt_states[drone_id, ]]))
                    else:
                        hunt_action, turn_action, speed_action = set_hunt_action(
                            False, cur_speeds[drone_id], turn_action, speed_action,
                            acquire_action, np.array([hunt_states[drone_id, ]]))
                    if cur_mode == PACK and (total_frame_ctr == 1 or
                            (replay_frame_ctr - 1) % PACK_EVAL_FRAMES == 0) and drone_id == 0:
                        # get 1 pack action for each set of drones on first drone
                        pack_action = set_pack_action(False, pack_state)
                        print(pack_action)
                        #print("++++++ pack action:", pack_action)

            # pass action, receive new state, reward
            new_turn_state, new_avoid_state, new_acquire_state, new_hunt_state, \
                new_drone_state, new_reward, new_speed = game_state.frame_step(
                    drone_id, turn_action, speed_action, pack_action,
                    cur_speeds[drone_id], total_frame_ctr, replay_frame_ctr)

            # append (horizontally) historical states for learning speed.
            # note: do this concatenation even for models that are not
            # learning (e.g., turn when running search, or turn, search and
            # acquire while running hunt) b/c their preds, performed above,
            # expect the same multi-frame view that was in place when they
            # trained.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                new_turn_state = state_frames(new_turn_state,
                                              np.array([turn_states[drone_id]]),
                                              TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

            if cur_mode in [AVOID, HUNT, PACK]:
                new_avoid_state = state_frames(new_avoid_state,
                                               np.array([avoid_states[drone_id]]),
                                               AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

            if cur_mode in [ACQUIRE, HUNT, PACK]:
                new_acquire_state = state_frames(new_acquire_state,
                                                 np.array([acquire_states[drone_id]]),
                                                 ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

            if cur_mode in [HUNT, PACK]:
                new_hunt_state = state_frames(new_hunt_state,
                                              np.array([hunt_states[drone_id]]),
                                              HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                if drone_id == 0:
                    # for 1st drone, pack state = drone state
                    new_pack_state = new_drone_state
                    pack_rwd = new_reward
                else:
                    # otherwise, append drone record to prior drone state
                    new_pack_state = state_frames(new_pack_state, new_drone_state,
                                                  DRONE_TOTAL_SENSOR, 2)
                    pack_rwd += new_reward

                new_drone_state = state_frames(new_drone_state,
                                               np.array([drone_states[drone_id]]),
                                               DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

                if drone_id == (NUM_DRONES - 1):
                    # for last drone build pack record
                    if total_frame_ctr == 1:
                        pack_state = np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES))
                    new_pack_state = state_frames(new_pack_state, pack_state,
                                                  PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)
                    # may need to add 1 to PACK_STATE_FRAMES

            # experience replay storage.
            # note: only the model being trained requires event storage, as
            # it is the stack that will be sampled for training below.
            if cur_mode == TURN:
                replay.append((np.array([turn_states[drone_id]]),
                               turn_action, new_reward, new_turn_state))
            elif cur_mode == AVOID:
                replay.append((np.array([avoid_states[drone_id]]),
                               speed_action, new_reward, new_avoid_state))
            elif cur_mode == ACQUIRE:
                replay.append((np.array([acquire_states[drone_id]]),
                               turn_action, new_reward, new_acquire_state))
            elif cur_mode == HUNT:
                replay.append((np.array([hunt_states[drone_id]]),
                               hunt_action, new_reward, new_hunt_state))
            elif cur_mode == PACK and (total_frame_ctr == 1 or
                                       replay_frame_ctr % PACK_EVAL_FRAMES == 0) \
                    and drone_id == (NUM_DRONES - 1):
                replay.append((pack_state, pack_action, pack_rwd, new_pack_state))

            # If we're done observing, start training.
            if total_frame_ctr > observe and (cur_mode != PACK or
                    (replay_frame_ctr % PACK_EVAL_FRAMES == 0 and
                     drone_id == (NUM_DRONES - 1))):

                # If we've stored enough in our buffer, pop the oldest.
                if len(replay) > buffer:
                    replay.pop(0)

                # Randomly sample our experience replay memory.
                minibatch = random.sample(replay, batchSize)

                # Get training values and fit the model being trained.
                if cur_mode == TURN:
                    X_train, y_train = process_minibatch(
                        minibatch, turn_model, TURN_NUM_INPUT, TURN_NUM_OUTPUT)
                    history = LossHistory()
                    turn_model.fit(X_train, y_train, batch_size=batchSize,
                                   nb_epoch=1, verbose=0, callbacks=[history])
                elif cur_mode == AVOID:
                    X_train, y_train = process_minibatch(
                        minibatch, avoid_model, AVOID_NUM_INPUT, AVOID_NUM_OUTPUT)
                    history = LossHistory()
                    avoid_model.fit(X_train, y_train, batch_size=batchSize,
                                    nb_epoch=1, verbose=0, callbacks=[history])
                elif cur_mode == ACQUIRE:
                    X_train, y_train = process_minibatch(
                        minibatch, acquire_model, ACQUIRE_NUM_INPUT, ACQUIRE_NUM_OUTPUT)
                    history = LossHistory()
                    acquire_model.fit(X_train, y_train, batch_size=batchSize,
                                      nb_epoch=1, verbose=0, callbacks=[history])
                elif cur_mode == HUNT:
                    X_train, y_train = process_minibatch(
                        minibatch, hunt_model, HUNT_NUM_INPUT, HUNT_NUM_OUTPUT)
                    history = LossHistory()
                    hunt_model.fit(X_train, y_train, batch_size=batchSize,
                                   nb_epoch=1, verbose=0, callbacks=[history])
                elif cur_mode == PACK:
                    X_train, y_train = process_minibatch(
                        minibatch, pack_model, PACK_NUM_INPUT, PACK_NUM_OUTPUT)
                    history = LossHistory()
                    pack_model.fit(X_train, y_train, batch_size=batchSize,
                                   nb_epoch=1, verbose=0, callbacks=[history])

                loss_log.append(history.losses)

            # Update the starting state with S'.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                turn_states[drone_id] = new_turn_state
            if cur_mode in [AVOID, HUNT, PACK]:
                avoid_states[drone_id] = new_avoid_state
            if cur_mode in [ACQUIRE, HUNT, PACK]:
                acquire_states[drone_id] = new_acquire_state
            if cur_mode in [HUNT, PACK]:
                hunt_states[drone_id] = new_hunt_state

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                drone_states[drone_id] = new_drone_state
                if drone_id == (NUM_DRONES - 1):
                    pack_state = new_pack_state
                    replay_frame_ctr = 0

            cur_speeds[drone_id] = new_speed
            cum_rwd += new_reward

            # in case of crash, report and initialize
            if new_reward == -500 or new_reward == -1000:
                # Log the drone's distance at this T.
                data_collect.append([total_frame_ctr, crash_frame_ctr])

                # Update max.
                if crash_frame_ctr > max_crash_frame_ctr:
                    max_crash_frame_ctr = crash_frame_ctr

                # Time it.
                tot_time = timeit.default_timer() - start_time
                fps = crash_frame_ctr / tot_time

                # Output some stuff so we can watch.
                print("Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d\t fps: %d" %
                      (max_crash_frame_ctr, total_frame_ctr, epsilon,
                       crash_frame_ctr, cur_mode, cum_rwd, int(fps)))

                # Reset.
                crash_frame_ctr = cum_rwd = cum_speed = 0
                start_time = timeit.default_timer()

        # decrement epsilon for another frame
        if epsilon > 0.1 and total_frame_ctr > observe:
            epsilon -= (1.0 / train_frames)

        if total_frame_ctr % 10000 == 0:
            if crash_frame_ctr != 0:
                print("Max: %d at %d\t eps: %f\t dist: %d\t mode: %d\t cum rwd: %d" %
                      (max_crash_frame_ctr, total_frame_ctr, epsilon,
                       crash_frame_ctr, cur_mode, cum_rwd))

        # Save model every 50k frames
        if total_frame_ctr % 50000 == 0:
            save_init = False
            if cur_mode == TURN:
                turn_model.save_weights('models/turn/turn-' + filename + '-' +
                                        str(START_SPEED) + '-' + str(total_frame_ctr) + '.h5',
                                        overwrite=True)
                print("Saving turn_model %s - %d - %d" %
                      (filename, START_SPEED, total_frame_ctr))
            elif cur_mode == AVOID:
                avoid_model.save_weights('models/avoid/avoid-' + filename + '-' +
                                         str(total_frame_ctr) + '.h5', overwrite=True)
                print("Saving avoid_model %s - %d" % (filename, total_frame_ctr))
            elif cur_mode == ACQUIRE:
                acquire_model.save_weights('models/acquire/acquire-' + filename + '-' +
                                           str(START_SPEED) + '-' + str(total_frame_ctr) + '.h5',
                                           overwrite=True)
                print("Saving acquire_model %s - %d" % (filename, total_frame_ctr))
            elif cur_mode == HUNT:
                hunt_model.save_weights('models/hunt/hunt-' + filename + '-' +
                                        str(total_frame_ctr) + '.h5', overwrite=True)
                print("Saving hunt_model %s - %d" % (filename, total_frame_ctr))
            elif cur_mode == PACK:
                pack_model.save_weights('models/pack/pack-' + filename + '-' +
                                        str(total_frame_ctr) + '.h5', overwrite=True)
                print("Saving pack_model %s - %d" % (filename, total_frame_ctr))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
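# state_frames is used throughout the drone training and play loops but
# never shown. A minimal sketch of a compatible frame-stacking helper,
# assuming it prepends the newest sensor frame to the existing history and
# truncates to the configured length; this body is an assumption, not the
# original implementation.
import numpy as np

def state_frames(new_state, old_state, num_sensors, frame_count):
    """Stack the newest sensor frame in front of the previous history.

    `new_state` is a (1, num_sensors) reading; `old_state` is the previous
    (1, num_sensors * frame_count) stacked vector. Returns a new
    (1, num_sensors * frame_count) vector with the oldest frame dropped.
    """
    combined = np.append(new_state, old_state)
    return combined[:num_sensors * frame_count].reshape(1, -1)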
def train_net(model, params):
    filename = params_to_filename(params)

    observe = 129  # Number of frames to observe before training.
    epsilon = 0.5
    train_frames = 50000  # Number of frames to play.
    steps = 0
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []  # to be displayed

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, state, _ = game_state.frame_step(2)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        print(t)
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 5)  # random
        else:
            # Get Q values for each action.
            print("PREDICTED", state)
            # time.sleep(1)
            x = state[0]
            y = state[1]
            qval = model.predict(np.array([x, y]).reshape((1, 2)), batch_size=1)
            action = np.argmax(qval)  # best

        # Take action, observe new state and get our treat.
        reward, new_state, term = game_state.frame_step(action)
        print("timestep: " + str(t) + " reward: " + str(reward) +
              " action: " + str(action) + " state: " + str(state))

        # Experience replay storage.
        replay.append((state, action, reward, new_state))
        # print(len(replay))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch2(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=batchSize,
                      verbose=0, callbacks=[history])
            loss_log.append(history.losses)
            steps += 1
            if steps % 1000 == 0:
                print("Step = " + str(steps), "Epsilon = " + str(epsilon))

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (10.0 / train_frames)
            print("EPSILON UPDATED", epsilon)

        # We died, so update stuff. Note: reset *before* the continue,
        # otherwise the reset is dead code.
        if term == 1:
            # print("Crashed.")
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset.
            car_distance = 0
            continue

        # We reached the goal, so update stuff.
        elif term == 2:
            print("Reached goal.", car_distance)
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])
            # Reset.
            car_distance = 0
            continue

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('saved-models/' + filename + '-' + str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

        # if keyboard.is_pressed('8'):
        #     print("Reset Goal")
        #     game_state.reset_goal()

        print(t, reward, action)
def train_net(best_action_model, params):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = 500000  # Number of frames to play. Was 1000000.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    cum_rwd = 0
    cum_rwd_read = 0
    cum_rwd_dist = 0
    cum_rwd_speed = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    save_init = True
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    # frame_step returns the state, reward, speed and the reward components.
    state, new_reward, cur_speed, _, _, _ = game_state.frame_step(
        START_ACTION, START_SPEED, START_DISTANCE)
    #state = state_frames(state, np.array([[0, 0, 0, 0, 0, 0, 0]]))  # zeroing distance readings
    #state = state_frames(state, np.zeros((1, NUM_SENSORS)))  # zeroing distance readings

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        #time.sleep(0.5)
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, NUM_OUTPUT)  # random
        else:
            # Get Q values for each action: call the best_action_model
            # passed to this function with the current state.
            qval = best_action_model.predict(state, batch_size=1)
            action = np.argmax(qval)  # best prediction

        # Take action, observe new state and get our treat.
        new_state, new_reward, new_speed, new_rwd_read, new_rwd_dist, new_rwd_speed = \
            game_state.frame_step(action, cur_speed, car_distance)

        # Use multiple frames.
        #new_state = state_frames(new_state, state)  # appends the last 2-3 moves

        # Experience replay storage.
        replay.append((state, action, new_reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            # (Open question: why a random sample? Could training be sped up
            # by taking the last batchSize transitions instead?)
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, best_action_model)

            # Train the best_action_model on this batch.
            history = LossHistory()
            best_action_model.fit(X_train, y_train, batch_size=batchSize,
                                  nb_epoch=1, verbose=0, callbacks=[history])
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state
        cur_speed = new_speed
        cum_rwd += new_reward
        cum_rwd_read += new_rwd_read
        cum_rwd_dist += new_rwd_dist
        cum_rwd_speed += new_rwd_speed

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        # We died, so update stuff.
        if new_reward == -500 or new_reward == -1000:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\t eps: %f\t dist: %d\t rwd: %d\t read: %d\t dist: %d\t speed: %d\t fps: %d" %
                  (max_car_distance, t, epsilon, car_distance, cum_rwd,
                   cum_rwd_read, cum_rwd_dist, cum_rwd_speed, int(fps)))

            # Reset.
            car_distance = 0
            cum_rwd = 0
            cum_rwd_read = 0
            cum_rwd_dist = 0
            cum_rwd_speed = 0
            start_time = timeit.default_timer()

        # Save the best_action_model every 50,000 frames.
        if t % 50000 == 0:
            save_init = False
            best_action_model.save_weights('saved-best_action_models/' + filename +
                                           '-' + str(t) + '.h5', overwrite=True)
            print("Saving best_action_model %s - %d" % (filename, t))

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)
def train_net(model, params, weights, path, trainFrames, i=10):
    filename = params_to_filename(params)

    observe = 1000  # Number of frames to observe before training.
    epsilon = 1
    train_frames = trainFrames  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    max_car_distance = 0
    car_distance = 0
    t = 0
    data_collect = []
    replay = []  # stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = carmunk.GameState(weights, [1, 0, 0, 0])

    # Get initial state by doing nothing and getting the state.
    _, state, temp1 = game_state.frame_step(2)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        # Choose an action.
        if random.random() < epsilon or t < observe:
            action = np.random.randint(0, 3)  # random, one of 3 actions
        else:
            # Get Q values for each action.
            qval = model.predict(state)
            action = np.argmax(qval)  # best
            #print ("action under learner ", action)

        # Take action, observe new state and get our treat.
        reward, new_state, temp2 = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            # history = LossHistory()
            actions, steploss = model.train(X_train, y_train)
            # loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # Decrement epsilon over time (float literal avoids Python 2
        # integer division rounding to zero).
        if epsilon > 0.1 and t > observe:
            epsilon -= (1.0 / train_frames)

        # We died (state feature 7 appears to be the crash flag), so update stuff.
        if state[0][7] == 1:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            #print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
            #      (max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

        # Save the model at the end of training.
        # print(t)
        if t % train_frames == 0:
            model.save_weights('saved-models_' + path + '/evaluatedPolicies/' +
                               str(i) + '-' + filename + '-' + str(t) + '.h5')
            print("Saving model %s - %d" % (filename, t))
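# params_to_filename appears in most of these training loops, and the saved
# checkpoint name in the keyboard-driven play() above
# ('1-164-150-100-50000-100000.h5') suggests it encodes the two hidden-layer
# sizes plus batchSize and buffer. A minimal sketch consistent with that
# naming, assuming params holds a 'nn' list of layer sizes; this body is an
# assumption.
def params_to_filename(params):
    """Encode hyperparameters into a checkpoint-friendly string, e.g.
    {'nn': [164, 150], 'batchSize': 100, 'buffer': 50000} -> '164-150-100-50000'."""
    return (str(params['nn'][0]) + '-' + str(params['nn'][1]) + '-' +
            str(params['batchSize']) + '-' + str(params['buffer']))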
def play(turn_model, turn_model_30, turn_model_50, turn_model_70, avoid_model,
         acquire_model, acquire_model_30, acquire_model_50, acquire_model_70,
         hunt_model, pack_model, params):

    total_frame_ctr = 0
    crash_frame_ctr = 0
    replay_frame_ctr = 0
    crash_ctr = 0
    acquire_ctr = 0
    cum_speed = 0
    stop_ctr = avoid_ctr = acquire_ctr = 0

    cur_speeds = []
    for i in range(NUM_DRONES):
        cur_speeds.append(START_SPEED)

    # initialize drone state holders
    turn_states = np.zeros([NUM_DRONES, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES])
    avoid_states = np.zeros([NUM_DRONES, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES])
    acquire_states = np.zeros([NUM_DRONES, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES])
    hunt_states = np.zeros([NUM_DRONES, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES])
    drone_states = np.zeros([NUM_DRONES, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES])

    # create game instance
    game_state = carmunk.GameState()

    # get initial state(s)
    turn_state, avoid_state, acquire_state, hunt_state, drone_state, reward, cur_speed = \
        game_state.frame_step(START_DRONE_ID, START_TURN_ACTION, START_SPEED_ACTION,
                              START_PACK_ACTION, START_SPEED, START_DISTANCE, 1)

    # initialize frame states
    if cur_mode in [TURN, AVOID, HUNT, PACK]:
        for i in range(NUM_DRONES):
            turn_states[i] = state_frames(
                turn_state,
                np.zeros((1, TURN_TOTAL_SENSORS * TURN_STATE_FRAMES)),
                TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)

    if cur_mode in [AVOID, HUNT, PACK]:
        for i in range(NUM_DRONES):
            avoid_states[i] = state_frames(
                avoid_state,
                np.zeros((1, AVOID_TOTAL_SENSORS * AVOID_STATE_FRAMES)),
                AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)

    if cur_mode in [ACQUIRE, HUNT, PACK]:
        for i in range(NUM_DRONES):
            acquire_states[i] = state_frames(
                acquire_state,
                np.zeros((1, ACQUIRE_NUM_SENSOR * ACQUIRE_STATE_FRAMES)),
                ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)

    if cur_mode in [HUNT, PACK]:
        for i in range(NUM_DRONES):
            hunt_states[i] = state_frames(
                hunt_state,
                np.zeros((1, HUNT_TOTAL_SENSORS * HUNT_STATE_FRAMES)),
                HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

    if cur_mode == PACK:
        for i in range(NUM_DRONES):
            drone_states[i] = state_frames(
                drone_state,
                np.zeros((1, DRONE_TOTAL_SENSOR * PACK_STATE_FRAMES)),
                DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

        #pack_state = state_frames(drone_state,
        #                          np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES)),
        #                          PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)
        pack_state = state_frames(drone_state, np.zeros((1, 30)), 10, 4)

    # Move.
    while True:
        total_frame_ctr += 1
        crash_frame_ctr += 1
        replay_frame_ctr += 1
        #time.sleep(1)

        for drone_id in range(NUM_DRONES):  # NUM_DRONES = 1, unless you're in PACK mode
            speed_action = START_SPEED_ACTION

            # choose action
            if cur_mode == TURN:
                turn_action, turn_model = set_turn_action(
                    False, cur_speeds[drone_id], np.array([turn_states[drone_id]]))
            else:
                if cur_mode in [AVOID, HUNT, PACK]:
                    turn_action, turn_model = set_turn_action(
                        False, cur_speeds[drone_id], np.array([turn_states[drone_id]]))
                if cur_mode == AVOID:
                    speed_action = set_avoid_action(
                        False, turn_action, np.array([avoid_states[drone_id]]))
                else:
                    if cur_mode in [HUNT, PACK]:
                        speed_action = set_avoid_action(
                            False, turn_action, np.array([avoid_states[drone_id]]))
                if cur_mode == ACQUIRE:
                    acquire_action, acquire_model = set_acquire_action(
                        False, cur_speeds[drone_id], np.array([acquire_states[drone_id, ]]))
                    turn_action = acquire_action
                else:
                    acquire_action, acquire_model = set_acquire_action(
                        False, cur_speeds[drone_id], np.array([acquire_states[drone_id, ]]))
                if cur_mode == HUNT:
                    hunt_action, turn_action, speed_action = set_hunt_action(
                        False, cur_speeds[drone_id], turn_action, speed_action,
                        acquire_action, np.array([hunt_states[drone_id, ]]))
                else:
                    hunt_action, turn_action, speed_action = set_hunt_action(
                        False, cur_speeds[drone_id], turn_action, speed_action,
                        acquire_action, np.array([hunt_states[drone_id, ]]))
                if cur_mode == PACK and (total_frame_ctr == 1 or
                        replay_frame_ctr % PACK_EVAL_FRAMES == 0) and drone_id == 0:
                    # get 1 pack action for each set of drones on first drone
                    pack_action = set_pack_action(False, pack_state)

            # pass action, receive new state, reward
            new_turn_state, new_avoid_state, new_acquire_state, new_hunt_state, \
                new_drone_state, new_reward, new_speed = game_state.frame_step(
                    drone_id, turn_action, speed_action, pack_action,
                    cur_speeds[drone_id], total_frame_ctr, replay_frame_ctr)

            # append (horizontally) historical states for learning speed.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                new_turn_state = state_frames(new_turn_state,
                                              np.array([turn_states[drone_id]]),
                                              TURN_TOTAL_SENSORS, TURN_STATE_FRAMES)
            if cur_mode in [AVOID, HUNT, PACK]:
                new_avoid_state = state_frames(new_avoid_state,
                                               np.array([avoid_states[drone_id]]),
                                               AVOID_TOTAL_SENSORS, AVOID_STATE_FRAMES)
            if cur_mode in [ACQUIRE, HUNT, PACK]:
                new_acquire_state = state_frames(new_acquire_state,
                                                 np.array([acquire_states[drone_id]]),
                                                 ACQUIRE_NUM_SENSOR, ACQUIRE_STATE_FRAMES)
            if cur_mode in [HUNT, PACK]:
                new_hunt_state = state_frames(new_hunt_state,
                                              np.array([hunt_states[drone_id]]),
                                              HUNT_TOTAL_SENSORS, HUNT_STATE_FRAMES)

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                if drone_id == 0:
                    # for 1st drone, pack state = drone state
                    new_pack_state = new_drone_state
                    pack_rwd = new_reward
                else:
                    # otherwise, append drone record to prior drone state
                    new_pack_state = state_frames(new_pack_state, new_drone_state,
                                                  DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES - 1)
                    pack_rwd += new_reward

                new_drone_state = state_frames(new_drone_state,
                                               np.array([drone_states[drone_id]]),
                                               DRONE_TOTAL_SENSOR, PACK_STATE_FRAMES)

                if drone_id == (NUM_DRONES - 1):
                    # for last drone build pack record
                    if total_frame_ctr == 1:
                        pack_state = np.zeros((1, PACK_TOTAL_SENSORS * PACK_STATE_FRAMES))
                    new_pack_state = state_frames(new_pack_state, pack_state,
                                                  PACK_TOTAL_SENSORS, PACK_STATE_FRAMES)
                    # may need to add 1 to PACK_STATE_FRAMES

            # Update the starting state with S'.
            if cur_mode in [TURN, AVOID, HUNT, PACK]:
                turn_states[drone_id] = new_turn_state
            if cur_mode in [AVOID, HUNT, PACK]:
                avoid_states[drone_id] = new_avoid_state
            if cur_mode in [ACQUIRE, HUNT, PACK]:
                acquire_states[drone_id] = new_acquire_state
            if cur_mode in [HUNT, PACK]:
                hunt_states[drone_id] = new_hunt_state

            if cur_mode == PACK and (total_frame_ctr == 1 or
                                     replay_frame_ctr % PACK_EVAL_FRAMES == 0):
                drone_states[drone_id] = new_drone_state
                if drone_id == (NUM_DRONES - 1):
                    pack_state = new_pack_state
                    replay_frame_ctr = 0

            cur_speeds[drone_id] = new_speed

            # give status
            if new_reward == -500 or new_reward == -1000:
                crash_ctr += 1
                print("crashes", crash_ctr, "frames", total_frame_ctr)
            elif new_reward == 1000:
                acquire_ctr += 1
                print("acquisitions:", acquire_ctr, "frames", total_frame_ctr)

        # note: assumes at least one crash (and acquisition, where relevant)
        # has occurred by the first report, else this divides by zero.
        if total_frame_ctr % 5000 == 0:
            print("***** total frames:", total_frame_ctr)
            print("***** frames between crashes:", int(total_frame_ctr / crash_ctr))
            if cur_mode in [ACQUIRE, HUNT, PACK]:
                print("***** frames / acquisition:", int(total_frame_ctr / acquire_ctr))
def trainNetwork(model, args):
    filename = 'rl'
    data_collect = []
    loss_log = []
    car_distance = 0
    r_t_sum = 0

    # open up a game state to communicate with the emulator
    game_state = carmunk.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    #print (s_t.shape)

    # In Keras, need to reshape.
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1*80*80*4

    if args['mode'] == 'Run':
        OBSERVE = 999999999  # keep observing, never train
        epsilon = FINAL_EPSILON
        print("Now we load weight")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        print("Weight load successfully")
    else:  # we go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON

    begin = strftime("%a, %d %b %Y %H:%M:%S", gmtime())

    t = 0
    while t < TOTAL_FRAMES:
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])
        car_distance += 1

        # choose an action epsilon-greedily
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                #print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                q = model.predict(s_t)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index = max_Q
                a_t[max_Q] = 1

        # reduce epsilon gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe the next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        r_t_sum += r_t

        x_t1 = skimage.color.rgb2gray(x_t1_colored)
        x_t1 = skimage.transform.resize(x_t1, (80, 80), mode='reflect')
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1

        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        if terminal:
            # Log the car's distance to get the GOAL.
            data_collect.append([t, car_distance, r_t_sum])
            car_distance = 0
            r_t_sum = 0

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))  # 32, 80, 80, 4
            #print (inputs.shape)
            targets = np.zeros((inputs.shape[0], ACTIONS))  # 32, 2

            # Now we do the experience replay.
            for i in range(0, len(minibatch)):
                state_t = minibatch[i][0]
                action_t = minibatch[i][1]   # this is the action index
                reward_t = minibatch[i][2]
                state_t1 = minibatch[i][3]
                terminal = minibatch[i][4]   # if terminated, the target is the reward alone

                inputs[i:i + 1] = state_t             # the saved-down s_t
                targets[i] = model.predict(state_t)   # predicted Q value for each action
                Q_sa = model.predict(state_t1)

                if terminal:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            # targets2 = normalize(targets)
            loss += model.train_on_batch(inputs, targets)
            #loss_log.append(model.train_on_batch(inputs, targets))

        s_t = s_t1
        t = t + 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            #print("Now we save model")
            model.save_weights("model.h5", overwrite=True)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)
            # Log results so far.
            log_results(filename, data_collect, loss_log)

        # print info. Note: the final "Execution" branch is unreachable,
        # since the first two conditions cover every t.
        if t <= OBSERVE:
            state = "Observe / Running"
        elif t > OBSERVE:
            state = "Training"
        else:
            state = "Execution"

        if t % 10000 == 0:
            print("TIMESTEP", t, "/ STATE", state,
                  "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,
                  "/ Q_MAX", np.max(Q_sa), "/ Loss ", loss, "/ Reward_sum ", r_t_sum,
                  "/ car_distance ", car_distance)

    print("Episode finished!")
    print("************************")

    end = strftime("%a, %d %b %Y %H:%M:%S", gmtime())
    print("=============== Total training time =====================")
    print(begin)
    print(end)
    print("=========================================================")

    # Log results after we're done all frames.
    log_results(filename, data_collect, loss_log)