import sys

import numpy as np

# Project-local imports (assumed repo layout): the quadcopter Task and the DDPG agent.
from agents.agent import DDPG
from task import Task


def main(num_episodes: int = 200):
    target_pos = np.array([0., 0., 140.])
    task = Task(target_pos=target_pos)
    agent = DDPG(task)

    best_score = -1000
    best_x, best_y, best_z = 0, 0, 0
    best_episode = 0
    data = {}

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        score = 0
        while True:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            score += reward

            # Remember the pose and episode at which the best running score was reached.
            if score > best_score:
                best_x = task.sim.pose[0]
                best_y = task.sim.pose[1]
                best_z = task.sim.pose[2]
                best_episode = i_episode
            best_score = max(score, best_score)

            # Overwritten every step, so the entry ends up holding end-of-episode values.
            data[i_episode] = {'Episode': i_episode, 'Reward': score, 'Action': action,
                               'Best_Score': best_score, 'x': task.sim.pose[0],
                               'y': task.sim.pose[1], 'z': task.sim.pose[2]}

            if done:
                print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), "
                      "last_position = ({:5.1f},{:5.1f},{:5.1f}), "
                      "best_position = ({:5.1f},{:5.1f},{:5.1f})".format(
                          i_episode, score, best_score,
                          task.sim.pose[0], task.sim.pose[1], task.sim.pose[2],
                          best_x, best_y, best_z), end="")
                break
        sys.stdout.flush()
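# Hypothetical entry point so the script above can be run directly; main() then
# uses its default of 200 episodes.
if __name__ == "__main__":
    main()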
import gym
import numpy as np

from agents.agent import DDPG
from pendulum_task import PendulumTask

# Sanity-check the DDPG agent on the OpenAI Gym Pendulum-v0 environment.
task = PendulumTask()
agent = DDPG(task)
env = gym.make("Pendulum-v0")

n_episodes = 400
rewards = np.zeros(n_episodes)

for i in range(n_episodes):
    cur_state = env.reset()
    agent.reset_episode(cur_state)
    while True:
        env.render()
        action = agent.act(cur_state)
        new_state, reward, done, _ = env.step(action)
        rewards[i] += reward
        # train step
        agent.step(action, reward, new_state, done)
        if done:
            print("\rEpisode = {:4d}, total_reward = {:7.3f}".format(i, rewards[i]))
            break
        cur_state = new_state
import sys

import numpy as np

from agents.agent import DDPG


def drive(num_episodes=1000, sample_distance=100, task_renew_distance=100,
          target_pos=np.array([0., 0., 10.]), initial_pos=None, sample_cb=None,
          running_size=10):
    agent = DDPG()

    positions = []
    rewards = []
    initial_poss = []
    target_poss = []
    distances = []
    times = []
    running_reward = []
    running_time = []
    running_distance = []
    max_reward = -100000

    for i_episode in range(0, num_episodes):
        # Periodically rebuild the task (fresh initial pose) via the project helper
        # `new_task`; a sketch of that helper follows this function.
        if i_episode % task_renew_distance == 0:
            epi_init_pos, task = new_task(initial_pos, target_pos)
            agent.new_task(task)

        state = agent.reset_episode()
        epi_positions = []
        epi_reward = 0
        epi_distances = []
        while True:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            epi_reward += reward
            epi_positions.append(task.sim.pose[:3])
            epi_distances.append(task.current_distance)
            if done:
                break

        avg_distance = np.average(epi_distances)
        print("\rEpisode = {:4d}, Reward = {:4n}, Avg Distance = {:4n}, time = {:4n}".format(
            i_episode + 1, epi_reward, avg_distance, task.sim.time), end="")

        rewards.append(epi_reward)
        distances.append(avg_distance)
        times.append(task.sim.time)

        # Running averages over the previous `running_size` episodes.
        if running_size < i_episode:
            running_reward.append(np.average(rewards[i_episode - running_size:i_episode]))
            running_time.append(np.average(times[i_episode - running_size:i_episode]))
            running_distance.append(np.average(distances[i_episode - running_size:i_episode]))
        else:
            running_reward.append(0)
            running_time.append(0)
            running_distance.append(0)

        positions.append(epi_positions)

        # Every `sample_distance` episodes, hand the collected trajectories to the
        # optional callback and start a fresh positions buffer.
        if i_episode % sample_distance == 0:
            max_reward = max(max_reward, epi_reward)
            initial_poss.append(epi_init_pos)
            target_poss.append(target_pos)
            if sample_cb is not None:
                sample_cb(epi_init_pos, positions, target_pos, rewards, distances, times,
                          running_reward, running_time, running_distance)
            positions = []

        sys.stdout.flush()

    return (epi_init_pos, positions, target_pos, rewards, distances, times,
            running_reward, running_time, running_distance)
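# Hypothetical sketch of the `new_task` helper that drive() calls; the project's
# real implementation is not shown here. It is assumed to pick an initial pose
# (randomised when none is given) and wrap it in the same Task class the other
# scripts use; the import path and Task constructor arguments are assumptions.
from task import Task


def new_task(initial_pos, target_pos):
    init_pose = np.array(initial_pos, dtype=float) if initial_pos is not None \
        else np.concatenate([np.random.uniform(-5.0, 5.0, 3), np.zeros(3)])
    task = Task(init_pose=init_pose, target_pos=target_pos)
    return init_pose, task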
import csv
import sys

import numpy as np

# Project-local imports (assumed repo layout): the takeoff task and the DDPG agent.
from agents.agent import DDPG
from takeoff_task import TakeoffTask


def main():
    labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi',
              'x_velocity', 'y_velocity', 'z_velocity',
              'phi_velocity', 'theta_velocity', 'psi_velocity',
              'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']
    file_output = 'data.txt'

    # write the header row
    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)

    num_episodes = 1000
    run_time = 10.
    target_pos = np.array([0., 0., 10.])  # take off and hover in place

    init_pose = np.array([0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
    init_velocities = np.array([0.0, 0.0, 0.0])
    init_angle_velocities = np.array([0.0, 0.0, 0.0])

    task = TakeoffTask(init_pose=init_pose, target_pos=target_pos, runtime=run_time)
    agent = DDPG(task)

    best_score = -np.inf
    results_list = []
    rewards_list = []

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        count = 0
        total_reward = 0
        results = {x: [] for x in labels}
        rewards = []
        while True:
            action = agent.act(state)  # noise is added for exploration
            next_state, reward, done = task.step(action)
            total_reward += reward
            rewards.append(reward)
            agent.step(action, reward, next_state, done)
            state = next_state

            # Log the simulator state plus the chosen rotor speeds, both in memory
            # and to the CSV file (write_to_csv is sketched below).
            to_write = [task.sim.time] + list(task.sim.pose) + \
                list(task.sim.v) + list(task.sim.angular_v) + list(action)
            for ii in range(len(labels)):
                results[labels[ii]].append(to_write[ii])
            write_to_csv(to_write)
            count += 1

            if done:
                score = total_reward / float(count) if count else 0.0
                results_list.append(results)
                rewards_list.append(rewards)
                if score > best_score:
                    best_score = score
                # plot every 200 episodes
                if i_episode % 200 == 0:
                    print('i should be plotting something now.')
                    print('episode {}'.format(i_episode))
                print("\rEpisode = {:4d}, score = {:7.3f}, best_score = {:7.3f}, "
                      "reward for episode = {}".format(
                          i_episode, score, best_score, total_reward), end="")  # [debug]
                break
        sys.stdout.flush()
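# Hypothetical sketch of the `write_to_csv` helper that main() relies on; the
# project's real version is not shown here. It is assumed to append one row of
# readings to the same file the header row was written to.
def write_to_csv(row, file_output='data.txt'):
    with open(file_output, 'a') as csvfile:
        csv.writer(csvfile).writerow(row)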