def main(argv):
    """Train a DDPG agent on the ``Takeoff`` task using command-line options.

    Recognised options (getopt spec ``"hn:d:a:"``):

    * ``-h``          print the usage text and exit.
    * ``-n <int>``    number of training iterations (default 10).
    * ``-d <int>``    display frequency handed to the trainer (default 2).
    * ``-a <float>``  value stored in ``init_pose[2]`` (default 10.0).

    Args:
        argv: Command-line arguments, without the program name.

    Raises:
        SystemExit: With status 2 on an unrecognised option, or with the
            default status after ``-h``.
    """
    # Training parameters
    n_iterations = 10
    display_freq = 2

    # Starting position
    init_pose = np.zeros(6)
    init_pose[2] = 10.0

    # Parse arguments
    try:
        opts, _ = getopt.getopt(argv, "hn:d:a:")
    except getopt.GetoptError:
        # Print the usage text first: once sys.exit() raises, nothing
        # placed after it in this handler would ever run.
        print_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-h":
            print_usage()
            sys.exit()
        elif opt == "-n":
            n_iterations = int(arg)
        elif opt == "-d":
            display_freq = int(arg)
        elif opt == "-a":
            init_pose[2] = float(arg)

    task = Takeoff(init_pose)
    agent = DDPG(task)
    trainer = Trainer(agent, show_graph=True, show_stats=True)

    print("\n\nStarting DDPG training for {:4d} iterations, z(t_0)={:4.1f}m".
          format(n_iterations, init_pose[2]))

    # Interactive plotting is switched on only for the duration of training.
    plt.ion()
    trainer.train(n_iterations, display_freq=display_freq, n_update_decay=3)
    plt.ioff()
def run_test_episode(agent: DDPG, task: Task, file_output):
    """Run a single test episode, log every step to CSV and return the traces.

    While the episode runs, ``agent.noise`` is replaced by an ``OUNoise``
    built with three ``0.0`` parameters (presumably disabling exploration
    noise -- confirm against ``OUNoise``). The previous noise object is put
    back afterwards, including when the episode raises.

    Args:
        agent: Agent providing ``noise``, ``action_size``, ``reset_episode()``
            and ``act(state)``.
        task: Task whose ``step`` returns
            ``(next_state, reward, done, new_rewards)``, where ``new_rewards``
            is a mapping, and which exposes ``sim.time``, ``sim.pose``,
            ``sim.v`` and ``sim.angular_v``.
        file_output: Path of the CSV file to (over)write.

    Returns:
        A ``(results, rewards_lists)`` tuple: ``results`` maps each column
        label to the list of per-step values; ``rewards_lists`` maps each key
        of ``new_rewards`` to the list of values seen across the episode.
    """
    print('\nRunning test episode ...')
    labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi',
              'x_velocity', 'y_velocity', 'z_velocity',
              'phi_velocity', 'theta_velocity', 'psi_velocity',
              'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4',
              'reward']
    results = {label: [] for label in labels}

    aux_noise = copy.copy(agent.noise)
    agent.noise = OUNoise(agent.action_size, 0.0, 0.0, 0.0)
    try:
        state = agent.reset_episode()  # start a new episode
        rewards_lists = defaultdict(list)
        print('state', state)
        print('state.shape', state.shape)

        # Run the simulation, and save the results.
        # newline='' lets the csv module control line endings itself, which
        # avoids blank rows on platforms that translate '\n'.
        with open(file_output, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(labels)
            while True:
                # The single action value is applied to all four rotors.
                rotor_speed = agent.act(state)
                rotor_speeds = np.array([rotor_speed] * 4)
                next_state, reward, done, new_rewards = task.step(rotor_speeds)
                for key, value in new_rewards.items():
                    rewards_lists[key].append(value)

                to_write = ([task.sim.time] + list(task.sim.pose)
                            + list(task.sim.v) + list(task.sim.angular_v)
                            + list(rotor_speeds) + [reward])
                for label, value in zip(labels, to_write):
                    results[label].append(value)
                writer.writerow(to_write)

                state = next_state
                if done:
                    break
    finally:
        # Restore noise, even if the episode failed part-way through.
        agent.noise = copy.copy(aux_noise)

    print('Finished test episode!\n')
    return results, rewards_lists
def main(num_episodes: int = 200):
    """Train a DDPG agent on a ``Task`` whose target position is (0, 0, 140).

    After every simulation step the running episode score is compared with the
    best score seen so far; when it is higher, the simulator's current x/y/z
    are remembered as the best position. When an episode finishes, a
    carriage-return progress line is printed and stdout is flushed.

    Args:
        num_episodes: Number of episodes to run.
    """
    goal = np.array([0., 0., 140.])
    task = Task(target_pos=goal)
    agent = DDPG(task)

    best_score = -1000
    best_x = best_y = best_z = 0
    best_episode = 0
    data = {}

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        score = 0
        done = False

        while not done:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            score += reward

            cur_x = task.sim.pose[0]
            cur_y = task.sim.pose[1]
            cur_z = task.sim.pose[2]

            if score > best_score:
                best_x, best_y, best_z = cur_x, cur_y, cur_z
                best_episode = i_episode
            best_score = max(score, best_score)

            # Overwritten on every step, so each entry ends up describing the
            # final step of its episode.
            data[i_episode] = {
                'Episode': i_episode,
                'Reward': score,
                'Action': action,
                'Best_Score': best_score,
                'x': cur_x,
                'y': cur_y,
                'z': cur_z,
            }

        # Episode finished: report progress on a single, rewritten line.
        print(
            "\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), last_position = ({:5.1f},{:5.1f},{:5.1f}), best_position = ({:5.1f},{:5.1f},{:5.1f})".format(
                i_episode, score, best_score,
                task.sim.pose[0], task.sim.pose[1], task.sim.pose[2],
                best_x, best_y, best_z),
            end="")
        sys.stdout.flush()
allow_soft_placement=True, log_device_placement=True)) import pandas as pd import numpy as np from agents.policy_search import PolicySearch_Agent from agents.agent import DDPG from task import TaskDefault, TaskFlyUp, TaskFlyTowardsGoal from runSimulation import runSimulation # init task (reward structure), and agent # simulation time and number of episodes init_pose = np.array([10., 10., 10.0, 0., 0., 0.]) target_pose = np.array([0., 0., 20.]) #SMM original [0., 0., 10.] simTime = 5 # make the sim run longer so the agent has more chance to adapt num_episodes = 2500 task = TaskFlyTowardsGoal(init_pose=init_pose, target_pos=target_pose, runtime=simTime) useDefault = False my_agent = DDPG(task, useDefault) print(my_agent) print(task) print("init_pose: ", init_pose) print("target_pose: ", target_pose) # Run the simulation and save the results. showPlotEachEpisode = False file_output = 'my_agent.txt' # save my results runSimulation(init_pose, target_pose, simTime, num_episodes, task, my_agent,\ showPlotEachEpisode, file_output)
# In[65]:

## TODO: Train your agent here.
import sys
import pandas as pd
from agents.agent import DDPG
#from tasks.takeoff import Task
from task import Task
import csv

# Run configuration: episode count and the target position given to the task.
num_episodes = 500
target_pos = np.array([0., 0., 100.])

task = Task(target_pos=target_pos)
agent = DDPG(task)

# Starting sentinels for the score extremes
# (presumably tightened per episode further down -- confirm).
worst_score = 1000000
best_score = -1000000.

# Reward log: file name plus one empty list per column label.
reward_log = "reward.txt"
reward_labels = ['episode', 'reward']
reward_results = {label: [] for label in reward_labels}

print("finished with setup")


# In[66]:

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
import numpy as np import gym from pendulum_task import PendulumTask from agents.agent import DDPG task = PendulumTask() agent = DDPG(task) env = gym.make("Pendulum-v0") done = False n_episodes = 400 rewards = np.zeros(n_episodes) for i in range(n_episodes): cur_state = env.reset() agent.reset_episode(cur_state) while True: env.render() random_action = env.action_space.sample() action = agent.act(cur_state) new_state, reward, done, _ = env.step(action) rewards[i] += reward #train step agent.step(action, reward, new_state, done) if done: print("\rEpisode = {:4d}, total_reward = {:7.3f}".format( i, rewards[i])) break else:
def drive(num_episodes=1000, sample_distance=100, task_renew_distance=100, target_pos=np.array([0., 0., 10.]), initial_pos=None, sample_cb=None, running_size=10):
    """Train a DDPG agent over many episodes and collect per-episode statistics.

    A fresh task is created with ``new_task(initial_pos, target_pos)`` every
    ``task_renew_distance`` episodes (including the first) and handed to the
    agent through ``agent.new_task``.

    Args:
        num_episodes: Number of episodes to run. With a value <= 0 no episode
            runs and the returned start position is ``None``.
        sample_distance: ``sample_cb`` is invoked on every episode whose
            zero-based index is a multiple of this value.
        task_renew_distance: Interval, in episodes, between task renewals.
            Must be non-zero (it is used as a modulus).
        target_pos: Target position forwarded to ``new_task``. The default
            array is shared between calls; this function does not modify it.
        initial_pos: Initial position forwarded to ``new_task``.
        sample_cb: Optional callable receiving the same nine values that this
            function returns, as positional arguments.
        running_size: Window length for the running averages.

    Returns:
        Tuple ``(epi_init_pos, positions, target_pos, rewards, distances,
        times, running_reward, running_time, running_distance)``.
        ``positions`` only holds the episodes recorded since the last sample
        point, because it is cleared after each one.
    """
    agent = DDPG()

    positions = []
    rewards = []
    distances = []
    times = []
    running_reward = []
    running_time = []
    running_distance = []

    # Bound up-front so the final return is valid even when no episode runs.
    epi_init_pos = None
    task = None

    for i_episode in range(num_episodes):
        if i_episode % task_renew_distance == 0:
            epi_init_pos, task = new_task(initial_pos, target_pos)
            agent.new_task(task)

        state = agent.reset_episode()
        epi_positions = []
        epi_reward = 0
        epi_distances = []

        while True:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            epi_reward += reward
            epi_positions.append(task.sim.pose[:3])
            epi_distances.append(task.current_distance)
            if done:
                break

        avg_distance = np.average(epi_distances)
        print("\rEpisode = {:4d}, Reward = {:4n}, Avg Distance = {:4n}, time = {:4n}".format(
            i_episode + 1, epi_reward, avg_distance, task.sim.time), end="")

        rewards.append(epi_reward)
        distances.append(avg_distance)
        times.append(task.sim.time)

        if running_size < i_episode:
            # Window covers the `running_size` episodes *before* this one;
            # the episode that just finished is not part of its own average.
            window = slice(i_episode - running_size, i_episode)
            running_reward.append(np.average(rewards[window]))
            running_time.append(np.average(times[window]))
            running_distance.append(np.average(distances[window]))
        else:
            # Not enough history yet: pad with zeros so all series stay aligned.
            running_reward.append(0)
            running_time.append(0)
            running_distance.append(0)

        positions.append(epi_positions)

        if i_episode % sample_distance == 0:
            if sample_cb is not None:
                sample_cb(epi_init_pos, positions, target_pos, rewards,
                          distances, times, running_reward, running_time,
                          running_distance)
            # Start a new batch of trajectories for the next sample point.
            positions = []

        sys.stdout.flush()

    return (epi_init_pos, positions, target_pos, rewards, distances, times,
            running_reward, running_time, running_distance)
# Exploration settings.
params.exploration_mu = 0
params.exploration_theta = 0.15
params.exploration_sigma = 0.02  # previously 0.002

# Learning rates and tau.
params.actor_learning_rate = 1.0e-5  # previously 0.0001
params.critic_learning_rate = 0.001  # 0.001
params.tau = 0.001

# Cell counts for the actor and critic networks.
params.actor_net_cells = [32, 32]
params.critic_net_cells = [32, 64]

params.gamma = 0.99

# Earlier sweeps, kept for reference:
# test_values = [1.0e-3, 1.0e-4, 1.0e-5,1.0e-6, 1.0e-7] # actor_learning_rate DONE
# test_values = [1.0e-2, 1.0e-3, 1.0e-4,1.0e-5] # critic_learning_rate DONE
# test_values = [0.9, 0.99] # gamma DONE
# test_values = [0.2, 0.02, 0.002, 0.0002] # exploration_sigma
test_values = [0.1, 0.01, 0.001, 0.0001]  # tau

# Think how to do the networks batch.
# One full training run per candidate tau, each with a fresh task and agent.
for test_value in test_values:
    params.tau = test_value
    task = Task(
        init_pose=init_pose,
        init_velocities=init_velocities,
        target_pos=target_pos,
    )
    agent = DDPG(task, params, buffer_size=buffer_size, batch_size=batch_size)
    run_training(agent, task, params, num_episodes, file_output)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from agents.agent import DDPG
from tasks.takeoff import Task
from collections import deque

# Initial conditions handed to the task.
init_pose = np.array([0., 0., 10., 0., 0., 0.])        # initial pose
init_velocities = np.array([0., 0., 0.])               # initial velocities
init_angle_velocities = np.array([0., 0., 0.])         # initial angle velocities

num_episodes = 500
target_pos = np.array([0., 0., 100.])

task = Task(
    target_pos=target_pos,
    init_pose=init_pose,
    init_angle_velocities=init_angle_velocities,
    init_velocities=init_velocities,
)
agent = DDPG(task)

# Score extremes start at +/- infinity.
worst_score = float('inf')
best_score = float('-inf')

# One empty list per logged column.
reward_labels = ['episode', 'reward', 'rolling10', 'rolling100']
reward_results = {label: [] for label in reward_labels}

# Bounded buffers holding at most the last 10 / 100 entries.
rolling_score_10 = deque(maxlen=10)
rolling_score_100 = deque(maxlen=100)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    score = 0
    # NOTE(review): no exit condition is visible in this excerpt -- confirm
    # the loop breaks on `done` further down.
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
def main():
    """Train a DDPG agent on ``TakeoffTask`` for 1000 episodes.

    Writes the column header to ``data.txt``, then for every simulation step
    records time, pose, velocities, angular velocities and the action, both in
    memory and through ``write_to_csv``. At the end of each episode the mean
    reward per step is compared with the best so far and a carriage-return
    progress line is printed.
    """
    labels = [
        'time', 'x', 'y', 'z', 'phi', 'theta', 'psi',
        'x_velocity', 'y_velocity', 'z_velocity',
        'phi_velocity', 'theta_velocity', 'psi_velocity',
        'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4'
    ]
    file_output = 'data.txt'

    # write initial row
    with open(file_output, 'w') as csvfile:
        csv.writer(csvfile).writerow(labels)

    num_episodes = 1000
    run_time = 10.
    target_pos = np.array([0., 0., 10.])  # takeoff and stay in place
    init_pose = np.array([0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
    # NOTE(review): the two arrays below are created but never passed on.
    init_velocities = np.array([0.0, 0.0, 0.0])
    init_angle_velocities = np.array([0.0, 0.0, 0.0])

    task = TakeoffTask(init_pose=init_pose,
                       target_pos=target_pos,
                       runtime=run_time)
    agent = DDPG(task)

    best_score = -np.inf
    results_list = []
    rewards_list = []

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        count = 0
        total_reward = 0
        results = {label: [] for label in labels}
        rewards = []
        done = False

        while not done:
            action = agent.act(state)  # noise is added for exploration
            next_state, reward, done = task.step(action)
            total_reward += reward
            rewards.append(reward)
            agent.step(action, reward, next_state, done)
            state = next_state

            # One row per step, in the same order as `labels`.
            to_write = [task.sim.time, *task.sim.pose, *task.sim.v,
                        *task.sim.angular_v, *action]
            for label, value in zip(labels, to_write):
                results[label].append(value)
            write_to_csv(to_write)
            count += 1

        # Episode finished: score is the mean reward per step.
        score = total_reward / float(count) if count else 0.0
        results_list.append(results)
        rewards_list.append(rewards)
        if score > best_score:
            best_score = score

        # plot every 200 episodes
        if i_episode % 200 == 0:
            print('i should be plotting something now.')
            print('episode {}'.format(i_episode))

        print(
            "\rEpisode = {:4d}, score = {:7.3f}, best_score = {:7.3f}, reward for episode = {}"
            .format(i_episode, score, best_score, total_reward),
            end="")  # [debug]
        sys.stdout.flush()
# Hyperparameters
MAX_STEPS = 700          # Maximum number of steps to run per episode
ACTOR_LR = 1e-4          # Actor network learning rate
CRITIC_LR = 1e-3         # Critic network learning rate
MU = 0.0                 # Ornstein-uhlenbeck noise parameter
THETA = 0.15             # Ornstein-uhlenbeck noise parameter
SIGMA = 0.2              # Ornstein-uhlenbeck noise parameter
BUFFER_SIZE = 10 ** 6    # Max size of the replay buffer
BATCH_SIZE = 128         # Number of samples to pick from replay buffer
GAMMA = 0.99             # Discount factor
TAU = 0.001              # Soft update to target network factor

# Create the environment
env = gym.make('BipedalWalker-v2')

# Create the agent
agent = DDPG(
    env,
    ACTOR_LR, CRITIC_LR,
    MU, THETA, SIGMA,
    BUFFER_SIZE, BATCH_SIZE,
    GAMMA, TAU,
)

# Reset the environment
S = env.reset()
rewards = []

# Train the DDPG agent in the environment
# NOTE(review): NUM_EPISODES is not defined in this excerpt -- confirm it is
# set earlier in the file.
for episode in range(1, NUM_EPISODES + 1):
    state = agent.reset_episode()  # Start a new episode
    # NOTE(review): no exit condition is visible in this excerpt -- confirm
    # the loop breaks on `done` further down.
    while True:
        env.render()
        # Perform the action given by the actor network + noise
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
import threading import queue from PIL import Image from yolo import YOLO from agents.agent import DDPG LOG_FILENAME = 'output.log' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) # Speed of the drone S = 60 # Frames per second of the pygame window display FPS = 25 yolov3 = YOLO() agent = DDPG() class FrontEnd(object): """ Maintains the Tello display and moves it through the keyboard keys. Press escape key to quit. The controls are: - T: Takeoff - L: Land - Arrow keys: Forward, backward, left and right. - A and D: Counter clockwise and clockwise rotations - W and S: Up and down. """ def __init__(self): # Init pygame pygame.init()