def balance_ball_nn():
    nao.initial_setup()
    nao.set_up_to_hold_tray()
    nao.hands_open()
    raw_input("Press enter to continue: ")
    nao.hands_grab()
    nao.continually_update_ball_information(0.01)

    net = nf.load_network()

    wait_time = 0.0
    if wait_time > 0:
        run_time = int(5 / wait_time)
    else:
        run_time = 1000

    action = -1
    nn_out = [0, 0, 0]
    inputs = (0, 0, 0)

    for i in range(0, run_time):
        pos = nao.ball_pos_lr  # Get the NN inputs based off the state of the ball and tray
        vel = nao.ball_vel_lr
        ang = nao.ball_ang_lr
        ball_values = (pos, vel, ang)

        max_values = (0.15, 0.3, 0.1)  # The maximum possible values for each of the 3 properties
        bin_values = (10, 8, 10)  # The number of bins for each value
        p, v, a = nf.inputs_to_bins(ball_values, max_values, bin_values)

        prev_inputs = copy.copy(inputs)
        inputs = (p, v, ang)  # ang, not a, because it is already in bin form

        if action >= 0:
            reward = nf.get_reward_nn(inputs, max_values)  # Reward for the current state
            future_action_value = nf.get_nn_output(net, inputs)  # Outputs for acting from the current state
            # nn_out holds the output for the previous state
            # prev_inputs holds the previous state
            # action holds the action taken from the previous state to get to the current state
            # reward is the reward for being in the current state
            net = nf.update_nn(net, prev_inputs, action, nn_out, reward,
                               future_action_value)

        nn_out = nf.get_nn_output(net, inputs)  # Feed the inputs to the NN
        action = np.argmax(nn_out)

        move = 0
        if action == 0:  # [1, 0, -1]
            move = 1
        elif action == 2:
            move = -1

        nao.interpolate_angles_relative_lr(move, 10, 2, 7)
        nao.go_to_interpolated_angles_lr()
        print ball_values, inputs, move
        time.sleep(wait_time)

    nao.hands_open()
    time.sleep(2)
    nao.rest()
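
# Illustrative sketch (not part of nf): one plausible way the continuous ball
# readings could be discretised into the bin indices used above. The real
# behaviour lives in nf.inputs_to_bins; the symmetric [-max, +max] range and
# linear mapping here are assumptions for illustration only, and the angle is
# handled specially in the code above (it is already a bin index), so this
# mainly illustrates the position/velocity case.
def _example_inputs_to_bins(values, max_values, bin_values):
    bins = []
    for value, max_val, n_bins in zip(values, max_values, bin_values):
        clipped = min(max(value, -max_val), max_val)          # Clip to [-max, +max]
        fraction = (clipped + max_val) / (2.0 * max_val)      # Map to [0, 1]
        bins.append(min(int(fraction * n_bins), n_bins - 1))  # Map to {0, ..., n_bins - 1}
    return tuple(bins)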
def balance_ball_q_mat():
    """
    Balance the ball using the q-matrix, updating it and recording experiences
    as the run progresses.

    Relies on bin_values, max_values and learn being defined at module level.
    """
    nao.initial_setup()
    nao.set_up_to_hold_tray()
    nao.hands_open()
    raw_input("Press enter to continue: ")
    nao.hands_grab()
    nao.continually_update_ball_information(0.1)

    q_mat = nf.load_q_matrix(bin_values[0], bin_values[1], bin_values[2],
                             act=2, is_nao=True, is_delay=True)
    #q_mat = nf.normalise_whole_q(q_mat)

    explore_rate = 1.0  # Probability of taking a random action (halved periodically below)

    wait_time = 0.1
    if wait_time > 0:
        run_time = int(50 / wait_time)
    else:
        run_time = 500

    ang_val = nao.angle_lr_interpolation
    update_experience = True
    record_experience = True
    save_exp = True

    if update_experience:
        experiences = nf.load_exp(bin_values[0], bin_values[1], bin_values[2], 2)
    else:
        experiences = np.empty((bin_values[0], bin_values[1], bin_values[2], 1),
                               dtype=object)  # Need dtype=object to hold lists of experiences
        for i in range(len(experiences)):
            for j in range(len(experiences[i])):
                for k in range(len(experiences[i][j])):
                    experiences[i][j][k][0] = []
    experiences = nf.clean_experiences(experiences)

    prev_state = (-1, -1, -1)
    prev_vals = (-1, -1, -1)
    action = 0

    for i in range(0, run_time):
        pos = nao.ball_pos_lr  # Get the inputs based off the state of the ball and tray
        vel = nao.ball_vel_lr
        ang = nao.ball_ang_lr
        ball_values = (pos, vel, ang)

        while ball_values == prev_vals:  # Make sure the readings have actually changed
            print("SAME")
            pos = nao.ball_pos_lr
            vel = nao.ball_vel_lr
            ang = nao.ball_ang_lr
            ball_values = (pos, vel, ang)

        #if abs(vel) < 0.001:
        #    vel = 0

        p, v, _ = nf.inputs_to_bins(ball_values, max_values, bin_values)
        print "State:", int(p), int(v), ang, q_mat[int(p), int(v), ang]

        reward = nf.get_reward(p, v, bin_values[0], bin_values[1]) * 5

        # Update the Q matrix now that we are in the new state
        if learn and i > 1:  # i > 1 so that prev_state holds a real state
            curr_state = (int(p), int(v), int(ang))
            # The action is remembered from the previous iteration
            q_mat = nf.update_q(q_mat, prev_state, curr_state, action, reward,
                                0.4, 0.99, prnt=True)
            print "Prev:", prev_state, "Curr:", curr_state, "Act:", action, "Rew", reward, "\n\n"

        if record_experience and i > 1:
            curr_state = (int(p), int(v), ang)
            exp = {"new_state": curr_state, "action": action, "reward": reward}
            print(len(experiences[prev_state[0], prev_state[1], prev_state[2], 0]))
            # Cap the number of stored experiences per state at 20
            if len(experiences[prev_state[0], prev_state[1], prev_state[2], 0]) < 20:
                experiences[prev_state[0], prev_state[1], prev_state[2], 0] = np.append(
                    experiences[prev_state[0], prev_state[1], prev_state[2], 0], exp)
            print(experiences[prev_state[0], prev_state[1], prev_state[2]][0])
            # experience = {
            #     "old_state": prev_state,
            #     "action": action,
            #     "new_state": curr_state,
            #     "reward": reward
            # }
            # experiences.append(experience)

        if record_experience and random.random() < 0.6:  # Only update the action some of the time - allows for a series of actions
            action = nf.get_action(q_mat, int(p), int(v), int(ang),
                                   bin_values[0], bin_values[1])
            #action = np.argmax(q_mat[int(p)][int(v)][int(ang)])
            if random.random() < 0.5:
                action = (action + 1) % 2
        elif not record_experience:
            action = nf.get_action(q_mat, int(p), int(v), int(ang),
                                   bin_values[0], bin_values[1])
            #action = np.argmax(q_mat[int(p)][int(v)][int(ang)])
            if learn and random.random() < explore_rate:
                action = random.randint(0, 1)
                print("RANDOM")

        # Overrides at the extreme position bins
        if record_experience and int(p) == 0 and int(ang) > 7:
            action = 1
        elif record_experience and int(p) == 11 and int(ang) < 2:
            action = 0

        move = 0
        if action == 1:  # Clockwise
            move = -1
        elif action == 0:  # Anticlockwise
            move = 1
        print "Action", move
        #print "ER", explore_rate

        ang_val += move
        if ang_val < 0:
            ang_val = 0
        if ang_val >= nao.num_angles:
            ang_val = nao.num_angles - 1

        #nao.interpolate_angles_relative_lr(move, bin_values[2], 0, 0)  # 2, bin_values[2]-3)
        nao.interpolate_angles_fixed_lr(ang_val, bin_values[2] - 1)
        nao.ball_ang_lr = ang_val
        nao.go_to_interpolated_angles_lr()
        #print "\nOld state:", int(p), int(v), int(ang), "\nAction:", move

        # Remember values for next loop
        prev_vals = (pos, vel, ang)
        prev_state = (int(p), int(v), int(ang))

        time.sleep(wait_time)

        # prev_state = (int(p), int(v), int(nao.ball_ang_lr))
        # prev_vals = (pos, vel, nao.ball_ang_lr)
        # new_ball_values = (nao.ball_pos_lr, nao.ball_vel_lr, nao.ball_ang_lr)
        # p2, v2, _ = nf.inputs_to_bins(new_ball_values, max_values, bin_values)
        # curr_state = (int(p2), int(v2), int(new_ball_values[2]))
        # # Update the Q matrix
        # if learn and i > 1:
        #     reward = nf.get_reward(p, v, bin_values[0], bin_values[1])
        #     q_mat = nf.update_q(q_mat, prev_state, curr_state, action, reward, 0.3, 0.99, prnt=True)
        #     print "Prev:", prev_state, "Curr:", curr_state, "Act:", action, "Rew", reward, "\n\n"
        #print "\n\n", ang, ang_val, reward
        #prev_state = curr_state

        # Halve the exploration rate a quarter of the way through each run
        freq = run_time / 4
        if i % freq == freq - 1:
            explore_rate /= 2.0

    nao.hands_open()
    time.sleep(2)
    nao.rest()

    nf.save_q(q_mat, bin_values[0], bin_values[1], bin_values[2], 2, True)

    if save_exp:
        sumup = 0
        nf.save_exp(experiences, bin_values[0], bin_values[1], bin_values[2],
                    max_values[0], max_values[1], max_values[2], 2)
        for i in range(0, len(experiences)):
            for j in range(0, len(experiences[i])):
                for k in range(0, len(experiences[i][j])):
                    print i, j, k, experiences[i][j][k][0]
                    sumup += len(experiences[i][j][k][0])
        print "num experiences:", sumup
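
# Illustrative sketch (not part of nf): a standard tabular Q-learning update,
# assuming the 0.4 and 0.99 arguments passed to nf.update_q above are the
# learning rate and discount factor. The project's own update may differ.
def _example_update_q(q_mat, prev_state, curr_state, action, reward,
                      learn_rate=0.4, discount=0.99):
    old_value = q_mat[prev_state][action]
    best_future = np.max(q_mat[curr_state])  # Value of the best action from the new state
    # Move the old estimate towards the bootstrapped target
    q_mat[prev_state][action] = old_value + learn_rate * (
        reward + discount * best_future - old_value)
    return q_mat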
def balance_ball_q_mat(nao,
                       wait_time=0.0,
                       run_time=300,
                       ball_update=0.01,
                       delay=False,
                       iser=False,
                       prnt=False):
    """
    Balance the ball on the nao robot using the q-matrix to make decisions

    nao - the nao robot object to control
    wait_time - time to wait between each action
    run_time - total number of actions to make
    ball_update - how often to update the recorded ball information (seconds)
    delay - whether to load the q matrix trained with a delay or not
    iser - passed to the q-matrix loader as is_er
    prnt - whether to print the state and chosen action each step
    """
    bin_values = __get_bin_values_q()
    max_values = __get_max_values()
    nao.num_angles = bin_values[2]

    # Set up the nao
    nao.initial_setup()
    nao.set_up_to_hold_tray()  # Move hands out in front
    nao.hands_open()  # Open hands
    raw_input("Press enter to continue: ")  # Wait so the user can position the tray
    nao.hands_grab()  # Close the hands and grab the tray
    nao.continually_update_ball_information(ball_update)  # Update the recorded ball position every ball_update seconds

    q_mat = nf.load_q_matrix(bin_values[0], bin_values[1], bin_values[2],
                             act=2, is_nao=False, is_delay=delay, is_er=iser)

    ang_val = nao.angle_lr_interpolation
    prev_state = (-1, -1, -1)
    prev_vals = (-1, -1, -1)
    action = 0

    for i in range(0, run_time):
        pos = nao.ball_pos_lr  # Get the inputs based off the state of the ball and tray
        vel = nao.ball_vel_lr
        ang = nao.ball_ang_lr
        ball_values = (pos, vel, ang)

        while ball_values == prev_vals:  # Make sure the readings have actually changed
            pos = nao.ball_pos_lr
            vel = nao.ball_vel_lr
            ang = nao.ball_ang_lr
            ball_values = (pos, vel, ang)

        if abs(vel) < 0.05:  # Stop flickering between two bins
            vel = 0

        p, v, a = nf.inputs_to_bins(ball_values, max_values, bin_values)
        p = int(p)
        v = int(v)
        a = int(a)

        action = nf.get_action(q_mat, p, v, a, bin_values[0], bin_values[1],
                               consensus=False)

        move = 0
        if action == 1:  # Anticlockwise
            move = 1
        elif action == 0:  # Clockwise
            move = -1

        if prnt:
            print "State:", p, v, a, "q_mat:", q_mat[p, v, a], "Action:", move

        ang_val += move
        if ang_val < 0:
            ang_val = 0
        if ang_val >= nao.num_angles:
            ang_val = nao.num_angles - 1

        nao.interpolate_angles_relative_lr(move, bin_values[2], 2, 7)
        nao.ball_ang_lr = ang_val
        nao.go_to_interpolated_angles_lr()

        # Remember values for next loop
        prev_vals = (pos, vel, ang)
        prev_state = (p, v, a)

        time.sleep(wait_time)
        nao.record_current_state(pos, vel, ang, move)  # Record the data for plotting performance

    nao.hands_open()
    time.sleep(2)
    nao.rest()

    nf.save_q(q_mat, bin_values[0], bin_values[1], bin_values[2], 2, True)
    nao.save_state_data(1, bin_values[0], bin_values[1], bin_values[2],
                        max_values[0], max_values[1], max_values[2],
                        "Performance on the nao with q-matrix", nao.ip)
def balance_ball_input(nao, append_q=True):
    """
    Balance the ball manually via keyboard input, recording each state and the
    chosen action as neural network training data.

    append_q - append to an existing data set for these parameters if one exists
    """
    bin_values = __get_bin_values_nn()
    max_values = __get_max_values()
    nao.num_angles = bin_values[2]

    nao.initial_setup()
    nao.set_up_to_hold_tray()
    nao.hands_open()
    raw_input("Press enter to continue: ")
    nao.hands_grab()
    nao.continually_update_ball_information(0.05)

    balance = True
    training_data_json = []
    filename = str(bin_values[0]) + "_" + str(bin_values[1]) + "_" + str(bin_values[2])
    file_loc = config["other"]["nn_train_data"] + "/training_data_nao_" + filename + ".json"

    if append_q:  # If data already exists for the current parameters, update that
        wd = os.getcwd()
        wd = wd.replace("\\", "/")  # Since slash directions differ between platforms (not used below)
        if os.path.exists(file_loc):
            print("UPDATE EXISTING DATA")
            load_json = open(file_loc, "r")
            training_data_json = json.load(load_json)["data"]
        else:  # Data set does not exist for the current parameters
            print("NEW DATA SET")
            training_data_json = []
    else:
        training_data_json = []

    while balance:
        move = getch()
        action = 0
        output = [0, 0, 0]
        record = True
        if move == 'k':
            action = 1
            output = [1, 0, 0]
        elif move == 'l':
            action = -1
            output = [0, 0, 1]
        elif move == ' ':
            balance = False
        elif move == 'o':
            action = 1
            record = False
        elif move == 'p':
            action = -1
            record = False

        nao.interpolate_angles_relative_lr(action, 10)  #, 2, 7)
        nao.go_to_interpolated_angles_lr()

        p = nao.ball_pos_lr
        v = nao.ball_vel_lr
        a = nao.ball_ang_lr

        if balance and record:
            new_data = {
                "pos": p,
                "vel": v,
                "ang": a,
                "out1": output[0],
                "out2": output[1],
                "out3": output[2]
            }
            training_data_json.append(new_data)

        p, v, a = nf.inputs_to_bins((p, v, a), max_values, bin_values)
        print p, v, a

    nao.hands_open()
    time.sleep(2)
    nao.rest()

    json_output = open(file_loc, 'w')
    json_data = {
        "metadata": [{
            "num_pos": bin_values[0],
            "num_vel": bin_values[1],
            "num_ang": bin_values[2],
            "max_pos": max_values[0],
            "max_vel": max_values[1],
            "max_ang": max_values[2]
        }],
        "data": training_data_json,
    }
    json.dump(json_data, json_output)
    json_output.close()
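
# Illustrative sketch: loading the training data written above back into
# input / target lists for supervised training. The file layout and field
# names match those written by balance_ball_input; anything beyond that (for
# example any normalisation the real training code might apply) is assumed.
def _example_load_training_data(file_loc):
    with open(file_loc, "r") as f:
        contents = json.load(f)
    inputs, targets = [], []
    for sample in contents["data"]:
        inputs.append([sample["pos"], sample["vel"], sample["ang"]])
        targets.append([sample["out1"], sample["out2"], sample["out3"]])
    # metadata holds the bin counts and maximum values the data was collected with
    return contents["metadata"][0], inputs, targets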
def balance_ball_nn(nao,
                    only_two_actions=True,
                    qnn=False,
                    trained_on_nao=False,
                    wait_time=0.0,
                    run_time=300,
                    ball_update=0.01):
    """
    Balance the ball on the nao robot using the neural network to make decisions

    only_two_actions - only use the two actions of tilt left and tilt right
    qnn - update the network as the run goes on (Q-learning style updates)
    trained_on_nao - which saved network to load (passed to nf.load_network)
    wait_time - time to wait between each action
    run_time - total number of actions to make
    ball_update - how often to update the recorded ball information (seconds)
    """
    bin_values = __get_bin_values_nn()
    max_values = __get_max_values()
    nao.num_angles = bin_values[2]

    nao.initial_setup()
    nao.set_up_to_hold_tray()
    nao.hands_open()
    raw_input("Press enter to continue: ")
    nao.hands_grab()
    nao.continually_update_ball_information(ball_update)

    net = nf.load_network(trained_on_nao)
    #max_values = (0.15, 0.3, 0.1)  # The maximum possible values for each of the 3 properties
    #bin_values = (10, 8, 10)  # The number of bins for each value

    action = -1
    nn_out = [0, 0, 0]
    inputs = (0, 0, 0)

    for i in range(0, run_time):
        pos = nao.ball_pos_lr  # Get the NN inputs based off the state of the ball and tray
        vel = nao.ball_vel_lr
        ang = nao.ball_ang_lr
        if abs(vel) < 0.05:  # Stop flickering between two bins
            vel = 0
        ball_values = (pos, vel, ang)

        p, v, a = nf.inputs_to_bins(ball_values, max_values, bin_values)
        prev_inputs = copy.copy(inputs)  # inputs still holds the value from the previous action
        inputs = (p, v, a)

        if action >= 0 and qnn:
            reward = nf.get_reward_nn_specific(inputs, prev_inputs, bin_values)  # Reward for the current state
            future_action_value = nf.get_nn_output(net, inputs)  # Outputs for acting from the current state
            # nn_out holds the output for the previous state
            # prev_inputs holds the previous state
            # action holds the action taken from the previous state to get to the current state
            # reward is the reward for being in the current state
            if reward != 0:  # Only update when the reward is non-zero
                print "Curr state:", inputs, " Prev state:", prev_inputs
                print "Prev output:", nf.get_nn_output(net, prev_inputs)
                net = nf.update_nn(net, prev_inputs, action, nn_out, reward,
                                   future_action_value, epochs=50,
                                   learn_rate_update_func=0.2)
                print "New output:", nf.get_nn_output(net, prev_inputs), "\n"

        nn_out = nf.get_nn_output(net, inputs)  # Feed the inputs to the NN
        action = np.argmax(nn_out)
        nao.record_current_state(pos, vel, ang, action)  # Record the data for plotting performance

        move = 0
        if action == 0:  # [1, 0, -1]
            move = 1
        elif action == 2:
            move = -1
        elif only_two_actions:
            if nn_out[0] > nn_out[2]:
                move = 1
            else:
                move = -1

        nao.interpolate_angles_relative_lr(move, 10, 2, 7)
        nao.go_to_interpolated_angles_lr()
        print ball_values, inputs, move, "\n"
        time.sleep(wait_time)

    nf.save_network(net)
    nao.save_state_data(0, bin_values[0], bin_values[1], bin_values[2],
                        max_values[0], max_values[1], max_values[2],
                        "Performance on the nao with nn", nao.ip,
                        two_act=only_two_actions, qnn=qnn)

    nao.hands_open()
    time.sleep(2)
    nao.rest()
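
# Illustrative sketch (not part of nf): how the training target for the
# Q-learning style network update above is typically formed. nn_out is the
# network's output for the previous state, action is the action taken from
# that state, and future_action_value is the output for the current state.
# The discount factor and the exact target construction inside nf.update_nn
# are assumptions here.
def _example_nn_update_target(nn_out, action, reward, future_action_value,
                              discount=0.99):
    target = list(nn_out)
    # Only the output for the action actually taken is moved towards the
    # bootstrapped value; the other outputs keep their current values.
    target[action] = reward + discount * max(future_action_value)
    return target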
def collect_experiences(nao,
                        update_experience=True,
                        save_exp=False,
                        wait_time=0.0,
                        run_time=300,
                        ball_update=0.01,
                        delay=False):
    """
    Run the q-matrix policy (with added randomness) on the nao and record the
    resulting state transitions as experiences for later experience-replay
    updates.

    update_experience - load and extend an existing experience store rather
                        than starting a new one
    save_exp - save the collected experiences when the run finishes
    wait_time - time to wait between each action
    run_time - total number of actions to make
    ball_update - how often to update the recorded ball information (seconds)
    delay - passed to the q-matrix loader as is_delay
    """
    # Set up the nao
    nao.initial_setup()
    nao.set_up_to_hold_tray()  # Move hands out in front
    nao.hands_open()  # Open hands
    raw_input("Press enter to continue: ")  # Wait so the user can position the tray
    nao.hands_grab()  # Close the hands and grab the tray
    nao.continually_update_ball_information(ball_update)  # Update the recorded ball position every ball_update seconds

    bin_values = __get_bin_values_q()
    max_values = __get_max_values()
    nao.num_angles = bin_values[2]

    q_mat = nf.load_q_matrix(bin_values[0], bin_values[1], bin_values[2],
                             act=2, is_nao=False, is_delay=delay)

    ang_val = nao.angle_lr_interpolation

    if update_experience:
        experiences = nf.load_exp(bin_values[0], bin_values[1], bin_values[2], 2)
    else:
        experiences = np.empty((bin_values[0], bin_values[1], bin_values[2], 1),
                               dtype=object)  # Need dtype=object to hold lists of experiences
        for i in range(len(experiences)):
            for j in range(len(experiences[i])):
                for k in range(len(experiences[i][j])):
                    experiences[i][j][k][0] = []
    experiences = nf.clean_experiences(experiences)

    prev_state = (-1, -1, -1)
    prev_vals = (-1, -1, -1)
    action = 0

    for i in range(0, run_time):
        pos = nao.ball_pos_lr  # Get the inputs based off the state of the ball and tray
        vel = nao.ball_vel_lr
        ang = nao.ball_ang_lr
        ball_values = (pos, vel, ang)

        while ball_values == prev_vals:  # Make sure the readings have actually changed
            pos = nao.ball_pos_lr
            vel = nao.ball_vel_lr
            ang = nao.ball_ang_lr
            ball_values = (pos, vel, ang)

        if abs(vel) < 0.05:  # Stop flickering between two bins
            vel = 0

        p, v, a = nf.inputs_to_bins(ball_values, max_values, bin_values)
        p = int(p)
        v = int(v)
        a = int(a)
        print "State:", p, v, a, q_mat[p, v, a], vel

        reward = nf.get_reward(p, v, bin_values[0], bin_values[1]) * 5

        if i > 1:  # Only record once prev_state holds a real state
            curr_state = (p, v, a)
            exp = {"new_state": curr_state, "action": action, "reward": reward}
            print(len(experiences[prev_state[0], prev_state[1], prev_state[2], 0]))
            # Cap the number of stored experiences per state at 20
            if len(experiences[prev_state[0], prev_state[1], prev_state[2], 0]) < 20:
                experiences[prev_state[0], prev_state[1], prev_state[2], 0] = np.append(
                    experiences[prev_state[0], prev_state[1], prev_state[2], 0], exp)
            print(experiences[prev_state[0], prev_state[1], prev_state[2]][0])

        if random.random() < 0.2:  # Only update the action some of the time - allows for a series of actions
            action = nf.get_action(q_mat, p, v, a, bin_values[0], bin_values[1],
                                   consensus=False)
            #action = np.argmax(q_mat[p][v][a])
            if random.random() < 0.2:
                action = (action + 1) % 2

        move = 0
        if action == 1:  # Anticlockwise
            move = 1
        elif action == 0:  # Clockwise
            move = -1
        print "Action", move

        ang_val += move
        if ang_val < 0:
            ang_val = 0
        if ang_val >= nao.num_angles:
            ang_val = nao.num_angles - 1

        nao.interpolate_angles_relative_lr(move, bin_values[2], 2, 7)  # 2, bin_values[2]-3)
        nao.ball_ang_lr = ang_val
        nao.go_to_interpolated_angles_lr()

        # Remember values for next loop
        prev_vals = (pos, vel, a)
        prev_state = (p, v, a)

        time.sleep(wait_time)
        nao.record_current_state(pos, vel, a, move)  # Record the data for plotting performance

    nao.hands_open()
    time.sleep(2)
    nao.rest()

    nf.save_q(q_mat, bin_values[0], bin_values[1], bin_values[2], 2, True)
    nao.save_state_data(1, bin_values[0], bin_values[1], bin_values[2],
                        max_values[0], max_values[1], max_values[2],
                        "Performance on the nao with q-matrix", nao.ip)

    if save_exp:
        sumup = 0
        nf.save_exp(experiences, bin_values[0], bin_values[1], bin_values[2],
                    max_values[0], max_values[1], max_values[2], 2)
        for i in range(0, len(experiences)):
            for j in range(0, len(experiences[i])):
                for k in range(0, len(experiences[i][j])):
                    print i, j, k, experiences[i][j][k][0]
                    sumup += len(experiences[i][j][k][0])
        print "num experiences:", sumup
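
# Illustrative sketch (not part of nf): replaying the experiences collected
# above to update a q-matrix offline. The dictionary fields and array layout
# match those used in collect_experiences; the update rule, learning rate and
# discount factor are assumptions for illustration only.
def _example_replay_experiences(q_mat, experiences, learn_rate=0.4, discount=0.99):
    for p in range(len(experiences)):
        for v in range(len(experiences[p])):
            for a in range(len(experiences[p][v])):
                for exp in experiences[p][v][a][0]:
                    old_state = (p, v, a)
                    new_state = tuple(exp["new_state"])
                    action = exp["action"]
                    reward = exp["reward"]
                    old_value = q_mat[old_state][action]
                    best_future = np.max(q_mat[new_state])  # Value of the best action from the new state
                    # Standard tabular Q-learning update towards the bootstrapped target
                    q_mat[old_state][action] = old_value + learn_rate * (
                        reward + discount * best_future - old_value)
    return q_mat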