def train_obstacle_avoid_policy(obs_id):
    """
    Train a NN control policy that avoids obstacles
    :return: best policy (set of network weights) found by the CCEA
    """
    # Create the rover domain, a single rover, and the CCEA population for its control policy
    rd = RoverDomain()
    rd.use_saved_poi_configuration()
    rd.use_saved_obstacle_config()
    rd.use_saved_rover_training_configs()

    rov = Rover(0)
    ea = Ccea(rov.n_inputs, rov.n_hnodes, rov.n_outputs)
    ea.create_new_population()

    for gen in range(generations):
        ea.reset_fitness()
        if gen == 0:
            policy_id = 0
        else:
            policy_id = ea.n_elites

        while policy_id < ea.pop_size:
            for config_id in range(n_configs):
                rov.reset_rover(rd.rover_configs[config_id])  # Reset rover to initial conditions
                rov.get_network_weights(ea.population["pop{0}".format(policy_id)])  # Apply network weights from CCEA
                rd.update_rover_path(rov, -1)  # Record starting position of each rover

                for step_id in range(n_steps):
                    rov.rover_sensor_scan(rd.pois, rd.obstacles, n_poi, n_obstacles)
                    rov.step(rd.world_x, rd.world_y)
                    rd.update_rover_path(rov, step_id)

                    # Update fitness of policies using reward information
                    ea.fitness[policy_id] += avoid_obstacle_reward(obs_id, rd.obstacles, rov, rd.world_x, rd.world_y)

            ea.fitness[policy_id] /= n_configs
            policy_id += 1

        # Choose new parents and create new offspring population
        ea.down_select()

    best_pol_id = np.argmax(ea.fitness)
    best_policy = ea.population["pop{0}".format(best_pol_id)]

    return best_policy
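# Illustrative driver (not part of the original module): train one avoidance policy per
# obstacle and store it under the "AvoidObstacle{id}" name that multi_reward_learning_single()
# later loads via use_saved_policy(). save_policy() is a hypothetical helper assumed to be the
# counterpart of use_saved_policy(); substitute the project's actual save routine.
def train_all_obstacle_policies():
    for obs_id in range(n_obstacles):
        best_policy = train_obstacle_avoid_policy(obs_id)
        save_policy(best_policy, "AvoidObstacle{0}".format(obs_id))  # save_policy is assumed, not defined here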
def multi_reward_learning_single():
    """
    Trains the brain to choose between trained rover control policies
    :return:
    """
    policies = {}
    if train_new_policies == 1:
        policies = train_policies()
    else:
        pol_id = 0
        for poi_id in range(n_poi):
            policies['Policy{0}'.format(pol_id)] = use_saved_policy('TowardsPOI{0}'.format(poi_id))
            pol_id += 1
            policies['Policy{0}'.format(pol_id)] = use_saved_policy('AwayFromPOI{0}'.format(poi_id))
            pol_id += 1
        for obs_id in range(n_obstacles):
            policies['Policy{0}'.format(pol_id)] = use_saved_policy("AvoidObstacle{0}".format(obs_id))
            pol_id += 1

    print("Training The Brain")
    visualizer_rover_path = np.zeros((s_runs, (n_steps + 1), 3))

    for srun in range(s_runs):  # Perform statistical runs
        print("Run: %i" % srun)

        # Create the rover domain, the rover, the brain, and the CCEA population for the brain
        rd = RoverDomain()
        rd.use_saved_poi_configuration()
        rd.use_saved_obstacle_config()
        rd.use_saved_rover_training_configs()
        rd.use_saved_rover_test_config()

        br = Brain()
        rov = Rover(0)
        ea = Ccea(br.n_inputs, br.n_hnodes, br.n_outputs)
        ea.create_new_population()

        # Train the Brain
        reward_history = []
        for gen in range(brain_gen):
            ea.reset_fitness()
            if gen == 0:
                policy_id = 0
            else:
                policy_id = ea.n_elites

            while policy_id < ea.pop_size:  # Evaluate each brain policy in the CCEA population
                for config_id in range(n_configs):
                    rov.reset_rover(rd.rover_configs[config_id])  # Reset rover to initial conditions
                    br.get_weights(ea.population["pop{0}".format(policy_id)])  # Apply network weights from CCEA
                    rd.update_rover_path(rov, -1)  # Record starting position of each rover

                    for step_id in range(n_steps):
                        rov.rover_sensor_scan(rd.pois, rd.obstacles, n_poi, n_obstacles)
                        state_vector = []
                        for bracket in range(8):
                            state_vector.append(rov.sensor_readings[bracket])

                        # Pick policy using brain's decision
                        br.get_inputs(state_vector)
                        br.get_outputs()
                        rov_policy = pick_policy(br.output_layer[0, 0], policies)
                        rov.get_network_weights(rov_policy)
                        rov.step(rd.world_x, rd.world_y)
                        rd.update_rover_path(rov, step_id)

                        # Update fitness of policies using reward information
                        global_reward = rd.step_based_global_reward(rov)
                        ea.fitness[policy_id] += global_reward

                ea.fitness[policy_id] /= n_configs
                policy_id += 1

            # Testing Phase (test best policies found so far) ---------------------------------------------------------
            rov.reset_rover(rd.rover_test_config)  # Reset rover to initial conditions
            policy_id = np.argmax(ea.fitness)
            br.get_weights(ea.population["pop{0}".format(policy_id)])  # Apply best set of weights to network
            rd.update_rover_path(rov, -1)

            for step_id in range(n_steps):
                rov.rover_sensor_scan(rd.pois, rd.obstacles, n_poi, n_obstacles)
                state_vector = []
                for bracket in range(8):
                    state_vector.append(rov.sensor_readings[bracket])

                # Pick policy using brain's decision
                br.get_inputs(state_vector)
                br.get_outputs()
                rov_policy = pick_policy(br.output_layer[0, 0], policies)
                rov.get_network_weights(rov_policy)
                rov.step(rd.world_x, rd.world_y)
                rd.update_rover_path(rov, step_id)

            global_reward = calc_global_reward(rd.rover_path, rd.pois, rd.obstacles)
            reward_history.append(global_reward)

            if gen == (brain_gen - 1):  # Save path at end of final generation
                visualizer_rover_path[srun] = rd.rover_path.copy()
                save_rover_path(visualizer_rover_path)

            # Choose new parents and create new offspring population
            ea.down_select()

        save_reward_history(reward_history, "Multi_Reward.csv")

    run_visualizer()
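# Minimal sketch of what pick_policy() is assumed to do (the project's actual implementation
# may differ): the brain emits a single value in [0, 1], which is binned uniformly to select
# one of the stored low-level policies ("Policy0", "Policy1", ...).
def pick_policy_sketch(brain_output, policies):
    n_policies = len(policies)
    choice = int(brain_output * n_policies)  # Map the [0, 1] output to a policy index
    if choice >= n_policies:  # Guard against brain_output == 1.0
        choice = n_policies - 1
    return policies["Policy{0}".format(choice)]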
def rovers_dpp_rewards(reward_type):
    """
    Train rovers using the D++ reward
    :param reward_type:
    :return:
    """
    p = get_parameters()
    rd = RoverDomain(p)

    # Create dictionary for each instance of rover and corresponding NN and EA population
    rv = {}
    for rv_id in range(p["n_rovers"]):
        rv["AG{0}".format(rv_id)] = Rover(p, rv_id, rd.rover_positions[rv_id])
        rv["EA{0}".format(rv_id)] = Ccea(p)

    print("Reward Type: ", reward_type)
    print("Coupling Requirement: ", p["c_req"])

    for srun in range(p["s_runs"]):  # Perform statistical runs
        print("Run: %i" % srun)

        # Reset CCEA and NN for new stat run
        for rv_id in range(p["n_rovers"]):  # Randomly initialize ccea populations
            rv["EA{0}".format(rv_id)].create_new_population()

        reward_history = []
        for gen in range(p["generations"]):
            for rv_id in range(p["n_rovers"]):
                rv["EA{0}".format(rv_id)].select_policy_teams()

            for team_number in range(p["pop_size"]):  # Each policy in CCEA is tested in teams
                for rv_id in range(p["n_rovers"]):
                    rv["AG{0}".format(rv_id)].reset_rover()  # Reset rover to initial conditions
                    policy_id = int(rv["EA{0}".format(rv_id)].team_selection[team_number])
                    weights = rv["EA{0}".format(rv_id)].population["pop{0}".format(policy_id)]
                    rv["AG{0}".format(rv_id)].get_network_weights(weights)  # Apply network weights from CCEA
                    rd.update_rover_path(rv["AG{0}".format(rv_id)], rv_id, -1)  # Record starting position of each rover

                for step_id in range(p["n_steps"]):
                    # Rover scans environment and constructs state vector
                    for rv_id in range(p["n_rovers"]):
                        rv["AG{0}".format(rv_id)].rover_sensor_scan(rv, rd.pois, p["n_rovers"], p["n_poi"])

                    # Rover processes scan information and acts
                    for rv_id in range(p["n_rovers"]):
                        rv["AG{0}".format(rv_id)].step(p["x_dim"], p["y_dim"])
                        rd.update_rover_path(rv["AG{0}".format(rv_id)], rv_id, step_id)

                # Update fitness of policies using reward information
                global_reward = calc_global_reward(p, rd.rover_path, rd.pois)
                dpp_rewards = calc_dpp_reward(p, rd.rover_path, rd.pois, global_reward)
                for rv_id in range(p["n_rovers"]):
                    policy_id = int(rv["EA{0}".format(rv_id)].team_selection[team_number])
                    rv["EA{0}".format(rv_id)].fitness[policy_id] = dpp_rewards[rv_id]

            # Testing Phase (test best policies found so far) ---------------------------------------------------------
            for rv_id in range(p["n_rovers"]):
                rv["AG{0}".format(rv_id)].reset_rover()  # Reset rover to initial conditions
                policy_id = np.argmax(rv["EA{0}".format(rv_id)].fitness)
                weights = rv["EA{0}".format(rv_id)].population["pop{0}".format(policy_id)]
                rv["AG{0}".format(rv_id)].get_network_weights(weights)  # Apply best set of weights to network
                rd.update_rover_path(rv["AG{0}".format(rv_id)], rv_id, -1)

            for step_id in range(p["n_steps"]):
                # Rover scans environment and constructs state vector
                for rv_id in range(p["n_rovers"]):
                    rv["AG{0}".format(rv_id)].rover_sensor_scan(rv, rd.pois, p["n_rovers"], p["n_poi"])

                # Rover processes information from scan and acts
                for rv_id in range(p["n_rovers"]):
                    rv["AG{0}".format(rv_id)].step(p["x_dim"], p["y_dim"])
                    rd.update_rover_path(rv["AG{0}".format(rv_id)], rv_id, step_id)

            global_reward = calc_global_reward(p, rd.rover_path, rd.pois)
            reward_history.append(global_reward)

            if gen == (p["generations"] - 1):  # Save path at end of final generation
                save_rover_path(p, rd.rover_path)

            # Choose new parents and create new offspring population
            for rv_id in range(p["n_rovers"]):
                rv["EA{0}".format(rv_id)].down_select()

        save_reward_history(reward_history, "DPP_Reward.csv")

    run_visualizer(p)
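# Background sketch (illustrative, not the project's calc_dpp_reward): D++ builds on the
# difference reward D_i = G(z) - G(z_-i), where G(z_-i) is the global reward recomputed with
# rover i counterfactually removed from the joint trajectory. The helper below shows that
# baseline counterfactual using the existing calc_global_reward(); it assumes rover_path is a
# numpy array indexed as [step, rover, coordinate] and that pushing a rover far outside the
# world removes its contribution. The full D++ reward additionally injects counterfactual
# partner rovers to cope with the coupling requirement p["c_req"].
def difference_rewards_sketch(p, rover_path, pois, global_reward):
    d_rewards = np.zeros(p["n_rovers"])
    for rv_id in range(p["n_rovers"]):
        counterfactual_path = rover_path.copy()
        counterfactual_path[:, rv_id, :] = -1000.00  # Move rover rv_id out of range of every POI
        g_without = calc_global_reward(p, counterfactual_path, pois)
        d_rewards[rv_id] = global_reward - g_without
    return d_rewards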
def main():
    sc = sequenceClassifier()
    ag = Agent()
    cc = Ccea()
    nn = NeuralNetwork()

    if p["create_new_sets"] == 1:
        sc.create_training_set()
        sc.save_training_set()
        sc.create_test_set()
        sc.save_test_set()
    else:
        sc.load_training_set()
        sc.load_test_set()

    for s in range(p["s_runs"]):
        print("Stat Run: ", s)

        # Training
        training_reward_history = []
        test_reward_history = []
        state_vec = np.ones(p["n_inputs"])
        cc.create_new_population()

        for gen in range(p["generations"]):
            print("Gen: ", gen)

            pop_id = cc.n_elites
            while pop_id < p["pop_size"]:  # Test each set of weights in EA
                nn.reset_nn()
                nn.get_weights(cc.population["pop{0}".format(pop_id)])
                fitness_score = 0.0

                for seq in range(p["train_set_size"]):
                    ag.reset_mem_block()
                    nn.clear_outputs()
                    seq_len = len(sc.training_set["set{0}".format(seq)])
                    current_sequence = sc.training_set["set{0}".format(seq)].copy()

                    for num in range(seq_len):
                        state_vec[0] = current_sequence[num]
                        nn.run_neural_network(state_vec, ag.mem_block)
                        ag.update_memory(nn.wgate_outputs, nn.encoded_memory)

                    # Score the network's classification against the training-set label
                    if nn.out_layer[0] < 0.5 and sc.training_set_answers[seq, 2] == -1:
                        fitness_score += 1
                    elif nn.out_layer[0] >= 0.5 and sc.training_set_answers[seq, 2] == 1:
                        fitness_score += 1

                cc.fitness[pop_id] = fitness_score / p["train_set_size"]
                pop_id += 1

            # Testing
            nn.reset_nn()
            state_vec = np.ones(p["n_inputs"])
            best_pol_id = np.argmax(cc.fitness)  # Find the best policy in the population currently
            nn.get_weights(cc.population["pop{0}".format(best_pol_id)])
            test_reward = 0.0

            for seq in range(p["test_set_size"]):
                ag.reset_mem_block()
                nn.clear_outputs()
                seq_len = len(sc.test_set["set{0}".format(seq)])
                current_sequence = sc.test_set["set{0}".format(seq)].copy()

                for num in range(seq_len):
                    state_vec[0] = current_sequence[num]
                    nn.run_neural_network(state_vec, ag.mem_block)
                    ag.update_memory(nn.wgate_outputs, nn.encoded_memory)  # Same memory update as in the training loop

                # Score against the test-set labels (test_set_answers is assumed to parallel training_set_answers)
                if nn.out_layer[0] < 0.5 and sc.test_set_answers[seq, 2] == -1:
                    test_reward += 1
                elif nn.out_layer[0] >= 0.5 and sc.test_set_answers[seq, 2] == 1:
                    test_reward += 1

            test_reward_history.append(test_reward / p["test_set_size"])
            training_reward_history.append(max(cc.fitness))
            cc.down_select()

        save_reward_history(training_reward_history, "Training_Fitness.csv")
        save_reward_history(test_reward_history, "Test_Reward.csv")
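# Assumed entry point: the module relies on the global parameter dictionary p, so running the
# file directly simply launches main().
if __name__ == "__main__":
    main()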