import time
from statistics import mean

import numpy as np
import psutil

import constants as cons
import globals as glo
from graphs import Graphs

# NOTE: the module paths below are assumptions based on how these names are used
# in this file; adjust them to match the repo's actual layout.
import set_mode                 # provides set_mode.MODE
import rewards as rew           # provides target_movement_reward()
import video_names as names     # provides EPISODE_VIDEO / EPISODE_VIDEO_SOLVED
from video import output_video


def train(agent, sim, replay_buffer):
    """Train the agent for the configured number of exploration timesteps.

    :param agent: (Agent) agent to use, TD3 algorithm
    :param sim: (robot environment) vrep simulation
    :param replay_buffer: (ReplayBuffer) replay buffer for the arm
    """
    best_avg = -10
    start_time = time.time()

    # initialize graphs
    graphs = Graphs()

    while glo.TIMESTEP < cons.EXPLORATION:
        if glo.EPISODE == cons.MAX_EPISODE:
            print('Max Episode Reached: {}'.format(cons.MAX_EPISODE))
            break
        print('Timesteps: {}/{}.'.format(glo.TIMESTEP, cons.EXPLORATION))

        glo.EPISODE += 1  # start at episode 1, after the test, so the last episode runs

        # get the current state of the robot arms (joint angles)
        right_pos, left_pos = sim.get_current_position()
        state = right_pos + left_pos

        # video recording
        video_array = []
        if glo.EPISODE % cons.VIDEO_INTERVAL == 0:
            video_record = True
            video_array.append(sim.get_video_image())
        else:
            video_record = False

        # reset episode variables
        solved = False        # each episode starts unsolved
        rewards = []          # stores the step rewards for averaging at episode end
        collision_count = 0
        episode_length = 0

        # -----------------------------------------------------------------------------
        # Start Episode
        # -----------------------------------------------------------------------------
        while True:
            glo.TIMESTEP += 1
            episode_length += 1

            # check memory utilization (debug block, left disabled)
            '''
            GPUtil.showUtilization()
            print(torch.cuda.memory_allocated())
            count_objs = 0
            for obj in gc.get_objects():
                try:
                    if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                        # print(type(obj), obj.size())
                        count_objs += 1
                except:
                    pass
            print("Total objects in GPU memory: {}".format(count_objs))
            '''

            # get the initial location of the target object
            target_start = sim.get_target_position()

            new_state = []
            if set_mode.MODE == 'cooperative':
                # get a new action based on the policy
                action = agent.select_action(np.array(state), noise=cons.POLICY_NOISE).tolist()
                # apply the action and get the new state
                right_state, left_state = sim.step_arms(action[:7], action[7:])
                new_state = right_state + left_state
            elif set_mode.MODE == 'independent' or set_mode.MODE == 'partial':
                right_action = agent.select_action(
                    np.array(state)[:7], 'right', noise=cons.POLICY_NOISE).tolist()
                left_action = agent.select_action(
                    np.array(state)[7:], 'left', noise=cons.POLICY_NOISE).tolist()
                # apply the action and get the new state
                right_state, left_state = sim.step_arms(right_action, left_action)
                new_state = right_state + left_state
                # store the actions together in the replay buffer
                action = right_action + left_action

            # add the image to the video array
            if video_record:
                video_array.append(sim.get_video_image())

            # get the new position of the target
            target_end = sim.get_target_position()
            target_x, target_y, target_z = target_end

            # calculate the reward and distance to target for the step
            reward, distance_moved, distance_to_target = rew.target_movement_reward(
                target_start, target_end, cons.XYZ_GOAL)

            # TODO: might want to use a range for each x, y, z rather than rounding;
            # see the sketch below.
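            # Sketch for the TODO above (illustrative only -- GOAL_TOLERANCE is a
            # hypothetical constant, not defined in this repo). A per-axis tolerance
            # avoids the edge cases of rounding each coordinate to one decimal place:
            #     within_goal = all(
            #         abs(coord - goal) <= cons.GOAL_TOLERANCE
            #         for coord, goal in zip(target_end, cons.XYZ_GOAL))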
            # round the x, y, z to compare with the goal positions
            round_target_x = round(target_x, 1)
            round_target_y = round(target_y, 1)
            round_target_z = round(target_z, 1)

            # check whether the target is within the goal range
            if round_target_x == cons.XYZ_GOAL[0] and round_target_y == cons.XYZ_GOAL[1] and \
                    round_target_z == cons.XYZ_GOAL[2]:
                # end the episode and mark it solved when the goal is reached
                # (bug fix: `solved` was never set, so the solved video never wrote)
                done = True
                solved = True
            else:
                done = False

            # check for multiple collisions in a row (5), and end the episode if so
            object_collision_table = sim.object_collision_state()
            if object_collision_table:
                # if it collides with the table 5 times in a row, end the episode
                collision_count += 1
                if collision_count > 4:
                    done = True
                    collision_count = 0
            else:
                collision_count = 0

            # check the distance between the grippers to see if the target was dropped
            if sim.check_suction_distance() > .32:
                done = True
                time.sleep(1)  # wait to allow the sim to catch up
                reward = -1    # dropping the target is very bad: penalize and end the episode

            if not sim.check_suction_prox():
                done = True
                time.sleep(1)  # wait to allow the sim to catch up
                reward = -1    # was zero; try a big, bad reward when the target is dropped

            # update the replay buffer with the new tuple
            replay_buffer.add(state, action, reward, new_state, done)

            # training step
            agent.train(replay_buffer, episode_length)

            # set the state to the new state for the next step
            state = new_state

            if solved:
                print('Solved on Episode: {}'.format(glo.EPISODE))

            # add the reward to the reward list
            rewards.append(reward)

            if episode_length == 50:
                # stop after 50 steps: 30 was too low to reach the goal, tried 45;
                # the agent was getting stuck flipping from bad to good
                done = True

            # calculate the elapsed time
            elapsed_time = time.time() - start_time

            if cons.WRITE_TO_FILE:
                cons.report.write_report_step(glo.EPISODE, glo.TIMESTEP, reward,
                                              distance_moved, distance_to_target,
                                              solved, elapsed_time)

            if done:
                # get the average reward for the episode
                mean_reward_episode = mean(rewards)

                # if the current episode's average is better, save the agent
                if best_avg < mean_reward_episode:
                    best_avg = mean_reward_episode
                    agent.save()

                if video_record and episode_length > 10:  # only record episodes over 10 steps
                    output_video(video_array, cons.SIZE, names.EPISODE_VIDEO)
                    if solved:
                        output_video(video_array, cons.SIZE, names.EPISODE_VIDEO_SOLVED)

                system_info = psutil.virtual_memory()
                print("\n*** Episode " + str(glo.EPISODE) + " ***")
                print("Avg_Reward [last episode of " + str(episode_length) + " steps]: "
                      + str(mean_reward_episode))
                print("Total Timesteps: " + str(glo.TIMESTEP))
                print("Elapsed Time: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                print("Memory Usage: " + str(system_info.percent) + "%")

                # reset the simulation at the end of the episode
                sim.reset_sim()

                # at the end of the episode, update the charts
                graphs.update_step_list_graphs()

                # end the episode
                break

        # check for system overload; if memory is over-utilized, quit
        system_info = psutil.virtual_memory()
        if system_info.percent > 98:
            break

    # write any remaining values
    cons.report.write_final_values()

    # at the end, do a final update of the graphs
    graphs.update_step_list_graphs()
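# Minimal usage sketch for train(). The constructor names and arguments below are
# assumptions for illustration only; the real Agent, ReplayBuffer, and simulation
# wrapper signatures live elsewhere in this repo and may differ:
#
#     sim = Simulation()                 # vrep simulation wrapper
#     agent = Agent()                    # TD3 agent (cooperative or per-arm)
#     replay_buffer = ReplayBuffer()     # experience replay for the arm(s)
#     train(agent, sim, replay_buffer)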
from reports import Reports
from graphs import Graphs
import random
import globals
import constants as cons

if __name__ == "__main__":
    # create report
    # report = Reports()
    report = cons.TD3_REPORT
    graph = Graphs()

    # fill the reports with dummy values for testing
    counter = 0
    anneal = .001
    for i in range(10000):
        for j in range(random.randint(1, 25)):
            counter += 1
            anneal += (.00001 * random.randint(-1, 2))
            report.write_report_step(i, counter,
                                     random.random() + anneal,
                                     random.random() + anneal,
                                     random.random() + anneal,
                                     False, 'date')
            report.write_report_actor(i, counter, random.random(), random.random())
            report.write_report_critic(i, counter, random.random(), random.random())
            report.write_report_error(i, counter, random.random())

    graph.update_step_list_graphs()
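# Note on the dummy data above: random.randint(-1, 2) samples from {-1, 0, 1, 2},
# so the anneal term drifts upward by about +0.000005 per step on average, giving
# the test curves a gentle upward trend for the graphs to display.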