import csv
import sys

import numpy as np

# Task and DDPG are the project's own classes; the import paths below are
# assumptions based on the usual layout of this project (task.py and agents/agent.py).
from task import Task
from agents.agent import DDPG


def train():
    runtime = 5.                                      # time limit of the episode
    init_pose = np.array([0., 0., 4.0, 0., 0., 0.0])  # initial pose
    init_velocities = np.array([0., 0., 0.0])         # initial velocities
    init_angle_velocities = np.array([0., 0., 0.])    # initial angle velocities
    file_output = 'rewards.txt'                       # file name for saved results
    num_episodes = 10
    target_pos = np.array([0., 0., 40.])

    task = Task(init_pose=init_pose,
                init_velocities=init_velocities,
                init_angle_velocities=init_angle_velocities,
                target_pos=target_pos)
    agent = DDPG(task)

    labels = ['episode', 'avg_reward', 'total_reward']
    results = {x: [] for x in labels}

    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)

        best_total_reward = -np.inf
        for i_episode in range(1, num_episodes + 1):
            state = agent.reset_episode()  # start a new episode
            total_reward = 0
            rewards = []
            while True:
                # select an action according to the learned policy plus exploration noise
                action = agent.act(state)
                # execute the action and observe the reward and the next state
                next_state, reward, done = task.step(action)
                # store the experience, sample a mini-batch, and learn
                agent.step(action, reward, next_state, done)

                # data tracking
                total_reward += reward
                rewards.append(reward)

                state = next_state
                if done:
                    best_total_reward = max(best_total_reward, total_reward)
                    avg_reward = np.mean(np.array(rewards))
                    print(task.sim.pose)

                    # log per-episode statistics to the results dict and the CSV file
                    to_write = [i_episode, avg_reward, total_reward]
                    for ii in range(len(labels)):
                        results[labels[ii]].append(to_write[ii])
                    writer.writerow(to_write)

                    print("\rEpisode = {:4d}, total_reward = {:7.3f}, avg_reward = {:7.3f} (best = {:7.3f})"
                          .format(i_episode, total_reward, avg_reward, best_total_reward),
                          end="")  # [debug]
                    break
            sys.stdout.flush()
    return agent
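# For a quick look at learning progress, the per-episode statistics written to
# rewards.txt by train() can be plotted. A minimal sketch, assuming the CSV layout
# produced above (header row: episode, avg_reward, total_reward); pandas and
# matplotlib are extra dependencies not used elsewhere in this project,
# and plot_rewards is a hypothetical helper name.
import matplotlib.pyplot as plt
import pandas as pd


def plot_rewards(file_output='rewards.txt'):
    # one row per episode: episode index, average per-step reward, total reward
    data = pd.read_csv(file_output)
    plt.plot(data['episode'], data['total_reward'], label='total reward')
    plt.plot(data['episode'], data['avg_reward'], label='average reward per step')
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.legend()
    plt.show()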
num_episodes = 1000
init_pose = np.array([0., 0., 0., 0., 0., 0.])
target_pos = np.array([0., 0., 10.])
init_velocities = np.array([0., 0., 0.])        # initial velocities
init_angle_velocities = np.array([0., 0., 0.])  # initial angular velocities

task = Task(init_pose=init_pose,
            target_pos=target_pos,
            init_angle_velocities=init_angle_velocities,
            init_velocities=init_velocities)

best_score = -np.inf
agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    score = 0
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        score += reward
        best_score = max(best_score, score)
        if done:
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format(
                i_episode, score, best_score), end="")  # [debug]
            break
    sys.stdout.flush()
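# To inspect what the trained agent actually does, a single episode can be rolled
# out while recording the simulator state. A minimal sketch, assuming the same
# Task/DDPG interface used above; run_episode is a hypothetical helper name, and
# agent.act() may still add exploration noise, so the recorded trajectory is not
# strictly the greedy policy.
def run_episode(agent, task):
    positions, rewards = [], []
    state = agent.reset_episode()
    while True:
        action = agent.act(state)
        state, reward, done = task.step(action)
        positions.append(list(task.sim.pose[:3]))  # x, y, z of the quadcopter
        rewards.append(reward)
        if done:
            break
    return np.array(positions), np.array(rewards)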