def init(target_pos, init_pose, init_angle_velocities, init_velocities, runtime,
         action_low, action_high, agent_type, action_repeat, action_size,
         success_mem_len, gamma=0.9, tau=0.1, buffer_size=100000, batch_size=128,
         exploration_mu=0, exploration_theta=0.15, exploration_sigma=0.2,
         success_distance=1):
    """Build a takeoff task and the requested agent, configured to match."""
    task = TakeOff_Task(target_pos=target_pos, init_pose=init_pose,
                        init_angle_velocities=init_angle_velocities,
                        init_velocities=init_velocities, runtime=runtime)
    task.configure(action_repeat=action_repeat, action_low=action_low,
                   action_high=action_high, action_size=action_size,
                   target_pos=target_pos, init_velocities=init_velocities,
                   init_angle_velocities=init_angle_velocities,
                   init_pose=init_pose, success_distance=success_distance)
    if agent_type == 'DDPG':
        agent = DDPG_Agent(task)
        agent.configure(gamma, tau, buffer_size, batch_size,
                        exploration_mu, exploration_theta, exploration_sigma)
    elif agent_type == 'Policy_Search':
        agent = PolicySearch_Agent(task)
    elif agent_type == 'Random_Binary':
        agent = Random_Binary_Agent(task)
        agent.configure(success_mem_len)
    elif agent_type == 'Simple':
        agent = DDPG_Agent_Simple(task)
        agent.configure(gamma, tau, buffer_size, batch_size,
                        exploration_mu, exploration_theta, exploration_sigma)
    else:
        # Fail fast instead of returning an unbound `agent` for unknown types.
        raise ValueError("Unknown agent_type: {}".format(agent_type))
    return task, agent
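# A minimal usage sketch for the `init` factory above. The concrete values
# (rotor-speed bounds of 0-900, action_repeat=3, and so on) are illustrative
# assumptions, not values prescribed by this project; adjust them to your
# setup. `init` must be in scope along with the task and agent classes.

import numpy as np

task, agent = init(
    target_pos=np.array([0., 0., 10.]),
    init_pose=np.array([0., 0., 0.1, 0., 0., 0.]),
    init_angle_velocities=np.array([0., 0., 0.]),
    init_velocities=np.array([0., 0., 0.]),
    runtime=5.,
    action_low=0.,      # assumed minimum rotor speed
    action_high=900.,   # assumed maximum rotor speed
    agent_type='DDPG',
    action_repeat=3,
    action_size=4,
    success_mem_len=10,
)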
# The sample agent given in `agents/policy_search.py` uses a very simple
# linear policy: it computes the action vector directly as the dot product of
# the state vector and a matrix of weights. It then randomly perturbs the
# parameters with Gaussian noise to produce a different policy. Based on the
# average reward obtained in each episode (`score`), it keeps track of the
# best set of parameters found so far and how the score is changing, and
# tweaks a scaling factor accordingly to widen or tighten the noise.
#
# Run the code cell below to see how the agent performs on the sample task.

# In[64]:

import sys

import numpy as np

from agents.policy_search import PolicySearch_Agent
from task import Task

num_episodes = 1000
target_pos = np.array([0., 0., 10.])
task = Task(target_pos=target_pos)
agent = PolicySearch_Agent(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(reward, done)
        state = next_state
        if done:
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), noise_scale = {}".format(
                i_episode, agent.score, agent.best_score, agent.noise_scale), end="")  # [debug]
            break
    sys.stdout.flush()

print("\nfinished running agent")
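# For reference, the hill-climbing scheme described above can be sketched in a
# few lines. This is NOT the exact code from `agents/policy_search.py`, just a
# minimal illustration of the mechanism it describes: a linear policy,
# Gaussian perturbation of the best parameters, and a noise scale that
# tightens after an improvement and widens otherwise. The class name and the
# 0.5/2.0 scaling factors are illustrative assumptions.

import numpy as np

class LinearPolicySketch:
    def __init__(self, state_size, action_size, noise_scale=0.1):
        self.w = np.random.normal(size=(state_size, action_size))  # policy weights
        self.best_w = self.w
        self.best_score = -np.inf
        self.noise_scale = noise_scale

    def act(self, state):
        # Action is a direct linear map of the state through the weight matrix.
        return np.dot(state, self.w)

    def learn(self, score):
        if score > self.best_score:
            # Improvement: keep these weights and search more locally.
            self.best_score = score
            self.best_w = self.w
            self.noise_scale = max(0.5 * self.noise_scale, 1e-3)
        else:
            # No improvement: revert to the best weights and search wider.
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        # Perturb the best parameters to produce the next candidate policy.
        self.w = self.best_w + self.noise_scale * np.random.normal(size=self.w.shape)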
import sys

import matplotlib.pyplot as plt
import numpy as np

from agents.policy_search import PolicySearch_Agent
from task import Task

# Modify the values below to give the quadcopter a different starting position.
runtime = 5.                                    # time limit of the episode
init_pose = np.array([5., 5., 5., 0., 0., 0.])  # initial pose
init_velocities = np.array([0., 0., 0.])        # initial velocities
init_angle_velocities = np.array([0., 0., 0.])  # initial angle velocities
file_output = 'data.txt'                        # file name for saved results
num_episodes = 1000
target_pos = np.array([5., 5., 5.])

task = Task(init_pose, init_velocities, init_angle_velocities, runtime, target_pos)
agent = PolicySearch_Agent(task)
rewards = []

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(reward, done)
        state = next_state
        if done:
            rewards.append(agent.total_reward)
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), noise_scale = {}".format(
                i_episode, agent.total_reward, agent.best_reward, agent.noise_scale), end="")
            break
    sys.stdout.flush()
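# matplotlib is imported above and the per-episode rewards are collected in
# `rewards`, so plotting the learning curve is a natural follow-up. This is a
# minimal sketch; the 50-episode smoothing window is an arbitrary assumption,
# not something fixed by the code above.

plt.plot(rewards, alpha=0.4, label='episode reward')
window = 50  # smoothing window (assumed value)
if len(rewards) >= window:
    smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
    plt.plot(range(window - 1, len(rewards)), smoothed,
             label='{}-episode mean'.format(window))
plt.xlabel('episode')
plt.ylabel('total reward')
plt.legend()
plt.show()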
import sys

import numpy as np

from agents.policy_search import PolicySearch_Agent
from task import Task

num_episodes = 1000
target_pos = np.array([0., 0., 10.])
task = Task(target_pos=target_pos)
agent = PolicySearch_Agent(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(reward, done)  # the agent only takes the reward and done flag
        state = next_state
        if done:
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), noise_scale = {}".format(
                i_episode, agent.score, agent.best_score, agent.noise_scale), end="")  # [debug]
            break
    sys.stdout.flush()