Example no. 1
# Note: the project-specific classes used below (DDPG, OUNoise, Takeoff, Task, Trainer)
# are assumed to be importable from the surrounding project; only the standard imports
# are spelled out here.
import copy
import csv
import getopt
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np


def main(argv):
    # Training parameters
    n_iterations = 10
    display_freq = 2
    # Starting position
    init_pose = np.zeros(6)
    init_pose[2] = 10.0
    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, "hn:d:a:")
    except getopt.GetoptError:
        print_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-h":
            print_usage()
            sys.exit()
        elif opt == "-n":
            n_iterations = int(arg)
        elif opt == "-d":
            display_freq = int(arg)
        elif opt == "-a":
            init_pose[2] = float(arg)

    task = Takeoff(init_pose)
    agent = DDPG(task)
    trainer = Trainer(agent, show_graph=True, show_stats=True)

    print("\n\nStarting DDPG training for {:4d} iterations, z(t_0)={:4.1f}m".
          format(n_iterations, init_pose[2]))

    plt.ion()
    trainer.train(n_iterations, display_freq=display_freq, n_update_decay=3)
    plt.ioff()


def run_test_episode(agent: DDPG, task: Task, file_output):
    print('\nRunning test episode ...')

    labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
              'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
              'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4', 'reward']
    results = {x : [] for x in labels}

    # Temporarily swap in zero-mean, zero-sigma noise so the test episode runs the deterministic policy
    aux_noise = copy.copy(agent.noise)
    agent.noise = OUNoise(agent.action_size, 0.0, 0.0, 0.0)

    state = agent.reset_episode() # start a new episode
    rewards_lists = defaultdict(list)
    print('state', state)
    print('state.shape', state.shape)

    # Run the simulation, and save the results.
    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        while True:
            rotor_speed = agent.act(state)
            rotor_speeds = np.array([rotor_speed] * 4)
            # rotor_speeds = [405]*4
            # rotor_speeds = [500, 490, 500, 500]
            next_state, reward, done, new_rewards = task.step(rotor_speeds)
            for key, value in new_rewards.items():
                rewards_lists[key].append(value)

            to_write = ([task.sim.time] + list(task.sim.pose) + list(task.sim.v)
                        + list(task.sim.angular_v) + list(rotor_speeds) + [reward])
            for ii in range(len(labels)):
                results[labels[ii]].append(to_write[ii])
            writer.writerow(to_write)

            state = next_state


            if done:
                break

    # Restore noise
    agent.noise = copy.copy(aux_noise)

    print('Finished test episode!\n')
    return results, rewards_lists
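
main() above calls a print_usage() helper that is not part of the excerpt. A minimal stand-in, together with the usual entry-point guard, is sketched below; the usage text simply restates the flags parsed by getopt and is an assumption, not the original helper.

def print_usage():
    # Hypothetical helper: the real print_usage() is not included in the excerpt.
    print("Usage: script.py [-h] [-n iterations] [-d display_freq] [-a initial_altitude_m]")


if __name__ == "__main__":
    main(sys.argv[1:])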
Example no. 3
def main(num_episodes: int = 200):

    target_pos = np.array([0., 0., 140.])
    task = Task(target_pos=target_pos)
    agent = DDPG(task)
    best_score = -1000
    best_x = 0
    best_y = 0
    best_z = 0
    best_episode = 0
    data = {}

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        score = 0

        while True:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            score += reward

            if score > best_score:
                best_x = task.sim.pose[0]
                best_y = task.sim.pose[1]
                best_z = task.sim.pose[2]
                best_episode = i_episode
            best_score = max(score, best_score)
            data[i_episode] = {'Episode': i_episode, 'Reward': score, 'Action': action, 'Best_Score': best_score,
                               'x': task.sim.pose[0], 'y': task.sim.pose[1], 'z': task.sim.pose[2]}
            if done:
                print(
                    "\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), last_position = ({:5.1f},{:5.1f},{:5.1f}), best_position = ({:5.1f},{:5.1f},{:5.1f})".format(
                        i_episode, score, best_score, task.sim.pose[0], task.sim.pose[1], task.sim.pose[2], best_x, best_y,
                        best_z), end="")
                break
        sys.stdout.flush()

import pandas as pd
import numpy as np
from agents.policy_search import PolicySearch_Agent
from agents.agent import DDPG
from task import TaskDefault, TaskFlyUp, TaskFlyTowardsGoal
from runSimulation import runSimulation

# init task (reward structure), and agent
# simulation time and number of episodes
init_pose = np.array([10., 10., 10.0, 0., 0., 0.])
target_pose = np.array([0., 0., 20.]) #SMM original [0., 0., 10.]
simTime = 5 # make the sim run longer so the agent has more chance to adapt
num_episodes = 2500
task = TaskFlyTowardsGoal(init_pose=init_pose, target_pos=target_pose, runtime=simTime)
useDefault = False
my_agent = DDPG(task, useDefault) 
print(my_agent)
print(task)
print("init_pose: ", init_pose)
print("target_pose: ", target_pose)

# Run the simulation and save the results.
showPlotEachEpisode = False
file_output = 'my_agent.txt' # save my results

runSimulation(init_pose, target_pose, simTime, num_episodes, task, my_agent,
              showPlotEachEpisode, file_output)
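
runSimulation itself is not included in the excerpt. The sketch below is a minimal guess at the loop it wraps, reusing the agent.reset_episode / agent.act / task.step / agent.step pattern seen in the other examples; the CSV format and the handling of showPlotEachEpisode are assumptions.

def runSimulation(init_pose, target_pose, simTime, num_episodes, task, agent,
                  showPlotEachEpisode, file_output):
    # Hypothetical sketch: init_pose, target_pose and simTime are already baked into `task`,
    # and per-episode plotting (showPlotEachEpisode) is omitted here.
    with open(file_output, 'w') as f:
        f.write('episode,total_reward\n')
        for i_episode in range(1, num_episodes + 1):
            state = agent.reset_episode()
            total_reward = 0.0
            while True:
                action = agent.act(state)
                next_state, reward, done = task.step(action)
                agent.step(action, reward, next_state, done)
                state = next_state
                total_reward += reward
                if done:
                    break
            f.write('{},{}\n'.format(i_episode, total_reward))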

Example no. 5
# In[65]:


## TODO: Train your agent here.
import sys
import pandas as pd
from agents.agent import DDPG
#from tasks.takeoff import Task
from task import Task
import csv

num_episodes = 500
target_pos = np.array([0., 0., 100.])
task = Task(target_pos=target_pos)
agent = DDPG(task) 
worst_score = 1000000
best_score = -1000000.
reward_log = "reward.txt"

reward_labels = ['episode', 'reward']
reward_results = {x : [] for x in reward_labels}

print("finished with setup")


# In[66]:


for i_episode in range(1, num_episodes+1):
    state = agent.reset_episode() # start a new episode
Example no. 6
import numpy as np
import gym
from pendulum_task import PendulumTask
from agents.agent import DDPG

task = PendulumTask()
agent = DDPG(task)
env = gym.make("Pendulum-v0")

done = False
n_episodes = 400
rewards = np.zeros(n_episodes)
for i in range(n_episodes):
    cur_state = env.reset()
    agent.reset_episode(cur_state)
    while True:
        env.render()
        random_action = env.action_space.sample()
        action = agent.act(cur_state)

        new_state, reward, done, _ = env.step(action)
        rewards[i] += reward

        #train step
        agent.step(action, reward, new_state, done)

        if done:
            print("\rEpisode = {:4d}, total_reward = {:7.3f}".format(
                i, rewards[i]))
            break
        else:
            cur_state = new_state
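
PendulumTask is not shown in the excerpt; DDPG presumably only reads the state/action dimensions and action bounds from it. A minimal adapter under that assumption (Pendulum-v0 has a 3-dimensional observation and a single torque action in [-2, 2]):

import gym

class PendulumTask:
    # Hypothetical adapter: exposes Pendulum-v0's dimensions in the attribute
    # interface the quadcopter tasks in the other examples appear to use.
    def __init__(self):
        env = gym.make("Pendulum-v0")
        self.state_size = env.observation_space.shape[0]    # 3: cos(theta), sin(theta), theta_dot
        self.action_size = env.action_space.shape[0]        # 1: joint torque
        self.action_low = float(env.action_space.low[0])    # -2.0
        self.action_high = float(env.action_space.high[0])  # +2.0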
Example no. 7
def drive(num_episodes=1000, sample_distance=100, task_renew_distance=100, target_pos=np.array([0., 0., 10.]), initial_pos=None, sample_cb=None, running_size=10):
    agent = DDPG()

    positions = []
    rewards = []
    initial_poss = []
    target_poss = []
    distances = []
    times = []
    running_reward = []
    running_time = []
    running_distance = []

    max_reward = -100000

    for i_episode in range(0, num_episodes):

        if i_episode % task_renew_distance == 0:
            epi_init_pos, task = new_task(initial_pos, target_pos)
            agent.new_task(task)

        state = agent.reset_episode()

        epi_positions = []
        epi_reward = 0
        epi_distances = []

        while True:
            action = agent.act(state) 
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            epi_reward += reward

            epi_positions.append(task.sim.pose[:3])
            epi_distances.append(task.current_distance)

            if done:
                break

        avg_distance = np.average(epi_distances)

        print("\rEpisode = {:4d}, Reward = {:4n}, Avg Distance = {:4n}, time = {:4n}".format(i_episode + 1, epi_reward, avg_distance, task.sim.time), end="")

        rewards.append(epi_reward)
        distances.append(avg_distance)
        times.append(task.sim.time)

        if running_size < i_episode:
            running_reward.append(np.average(rewards[i_episode - running_size : i_episode]))
            running_time.append(np.average(times[i_episode - running_size : i_episode]))
            running_distance.append(np.average(distances[i_episode - running_size : i_episode]))
        else:
            running_reward.append(0)
            running_time.append(0)
            running_distance.append(0)

        positions.append(epi_positions)

        if i_episode % sample_distance == 0:
            max_reward = max([max_reward, epi_reward])
            initial_poss.append(epi_init_pos)
            target_poss.append(target_pos)

            if sample_cb is not None:
                sample_cb(epi_init_pos, positions, target_pos, rewards, distances, times, running_reward, running_time, running_distance)

            positions = []

        sys.stdout.flush()

    return epi_init_pos, positions, target_pos, rewards, distances, times, running_reward, running_time, running_distance
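
drive() depends on a new_task() helper that the excerpt does not include; from the call site it returns an initial pose together with a freshly built task. A minimal stand-in under that assumption (the random-start logic and the Task constructor arguments are illustrative, not the original):

def new_task(initial_pos, target_pos, runtime=5.):
    # Hypothetical sketch: pick a start pose (random if none was given) and build a task around it.
    if initial_pos is None:
        epi_init_pos = np.concatenate([np.random.uniform(-5., 5., size=3), np.zeros(3)])
    else:
        epi_init_pos = np.array(initial_pos, dtype=float)
    task = Task(init_pose=epi_init_pos, target_pos=target_pos, runtime=runtime)
    return epi_init_pos, task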
params.exploration_mu = 0
params.exploration_theta = 0.15
params.exploration_sigma = 0.02 #0.002
params.actor_learning_rate = 1.0e-5 # 0.0001
params.critic_learning_rate = 0.001  # 0.001
params.tau = 0.001
params.actor_net_cells = [16*2, 16*2]
params.critic_net_cells = [16*2, 32*2]
params.gamma = 0.99

# test_values = [1.0e-3, 1.0e-4, 1.0e-5,1.0e-6, 1.0e-7] # actor_learning_rate DONE
# test_values = [1.0e-2, 1.0e-3, 1.0e-4,1.0e-5] # critic_learning_rate        DONE
# test_values = [0.9, 0.99] # gamma                                           DONE
# test_values = [0.2, 0.02, 0.002, 0.0002] # exploration_sigma
test_values = [0.1, 0.01, 0.001, 0.0001] # tau

# TODO: work out how to sweep the network sizes (actor_net_cells, critic_net_cells) in the same batched way.

for test_value in test_values:
    params.tau = test_value

    task = Task(init_pose = init_pose,
                init_velocities = init_velocities,
                target_pos = target_pos)
    agent = DDPG(task,
                 params,
                 buffer_size = buffer_size,
                 batch_size = batch_size
                 )

    run_training(agent, task, params, num_episodes, file_output)
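
The fragment above assigns attributes on a params object whose construction is not shown. A minimal stand-in that mirrors the values set in the fragment, assuming the DDPG constructor only needs plain attribute access (the class name is hypothetical):

from dataclasses import dataclass, field

@dataclass
class DDPGParams:
    # Hypothetical container mirroring the attributes assigned in the fragment above.
    exploration_mu: float = 0.0
    exploration_theta: float = 0.15
    exploration_sigma: float = 0.02
    actor_learning_rate: float = 1.0e-5
    critic_learning_rate: float = 0.001
    tau: float = 0.001
    actor_net_cells: list = field(default_factory=lambda: [16 * 2, 16 * 2])
    critic_net_cells: list = field(default_factory=lambda: [16 * 2, 32 * 2])
    gamma: float = 0.99

params = DDPGParams()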
Example no. 9
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from agents.agent import DDPG
from tasks.takeoff import Task
from collections import deque
init_pose = np.array([0., 0., 10., 0., 0., 0.])  # initial pose
init_velocities = np.array([0., 0., 0.])         # initial velocities
init_angle_velocities = np.array([0., 0., 0.])   # initial angle velocities

num_episodes = 500
target_pos = np.array([0., 0., 100.])
task = Task(target_pos=target_pos, init_pose=init_pose, init_angle_velocities=init_angle_velocities, 
    init_velocities=init_velocities)
agent = DDPG(task) 
worst_score = float('inf')
best_score = float('-inf')

reward_labels = ['episode', 'reward', 'rolling10', 'rolling100']
reward_results = {x : [] for x in reward_labels}

rolling_score_10 = deque(maxlen=10)
rolling_score_100 = deque(maxlen=100)
for i_episode in range(1, num_episodes+1):
    state = agent.reset_episode() # start a new episode
    score = 0
    while True:
        action = agent.act(state) 
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
Example no. 10
def main():
    labels = [
        'time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
        'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
        'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3',
        'rotor_speed4'
    ]
    file_output = 'data.txt'

    # write initial row
    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)

    num_episodes = 1000
    run_time = 10.
    target_pos = np.array([0., 0., 10.])  # takeoff and stay in place
    init_pose = np.array([0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
    init_velocities = np.array([0.0, 0.0, 0.0])
    init_angle_velocities = np.array([0.0, 0.0, 0.0])

    task = TakeoffTask(init_pose=init_pose,
                       target_pos=target_pos,
                       runtime=run_time)
    agent = DDPG(task)

    best_score = -np.inf

    results_list = []
    rewards_list = []

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        count = 0
        total_reward = 0

        results = {x: [] for x in labels}
        rewards = []

        while True:
            action = agent.act(state)  # noise is added for exploration
            next_state, reward, done = task.step(action)

            total_reward += reward
            rewards.append(reward)

            agent.step(action, reward, next_state, done)
            state = next_state

            to_write = [task.sim.time] + list(task.sim.pose) + list(
                task.sim.v) + list(task.sim.angular_v) + list(action)
            for ii in range(len(labels)):
                results[labels[ii]].append(to_write[ii])

            write_to_csv(to_write)

            count += 1
            if done:
                score = total_reward / float(count) if count else 0.0

                results_list.append(results)
                rewards_list.append(rewards)

                if score > best_score:
                    best_score = score

                # plot every 200 episodes

                if i_episode % 200 == 0:
                    print('i should be plotting something now.')
                    print('episode {}'.format(i_episode))

                print(
                    "\rEpisode = {:4d}, score = {:7.3f}, best_score = {:7.3f}, reward for episode = {}"
                    .format(i_episode, score, best_score, total_reward),
                    end="")  # [debug]
                break
        sys.stdout.flush()
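
The loop above calls a write_to_csv() helper that the excerpt does not define; a minimal stand-in, assuming it simply appends one row to the same data.txt file whose header is written at the top of main():

def write_to_csv(row, file_output='data.txt'):
    # Hypothetical helper: append a single result row to the CSV started in main().
    with open(file_output, 'a') as csvfile:
        csv.writer(csvfile).writerow(row)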
Example no. 11
MAX_STEPS = 700  # Maximum number of steps to run per episode
ACTOR_LR = 1e-4  # Actor network learning rate
CRITIC_LR = 1e-3  # Critic network learning rate
MU = 0.0  # Ornstein-uhlenbeck noise parameter
THETA = 0.15  # Ornstein-uhlenbeck noise parameter
SIGMA = 0.2  # Ornstein-uhlenbeck noise parameter
BUFFER_SIZE = 1000000  # Max size of the replay buffer
BATCH_SIZE = 128  # Number of samples to pick from replay buffer
GAMMA = 0.99  # Discount factor
TAU = 0.001  # Soft update to target network factor

# Create the environment
env = gym.make('BipedalWalker-v2')

# Create the agent
agent = DDPG(env, ACTOR_LR, CRITIC_LR, MU, THETA, SIGMA, BUFFER_SIZE,
             BATCH_SIZE, GAMMA, TAU)

# Reset the environment
S = env.reset()
rewards = []

# Train the DDPG agent in the environment
for episode in range(1, NUM_EPISODES + 1):
    state = agent.reset_episode()  # Start a new episode

    while True:
        env.render()

        # Perform the action given by the actor network + noise
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
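
The MU, THETA and SIGMA constants above parameterize the Ornstein-Uhlenbeck process that DDPG commonly uses for exploration noise on the actor's output. A minimal sketch of that process follows; the class name and the reset()/sample() interface are assumptions based on the standard implementation, not code from this example.

import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process: temporally correlated noise that drifts back toward mu.
    def __init__(self, size, mu=MU, theta=THETA, sigma=SIGMA):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode at the long-run mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean reversion plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state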
Example no. 12
import logging
import threading
import queue

import pygame
from PIL import Image
from yolo import YOLO
from agents.agent import DDPG

LOG_FILENAME = 'output.log'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)

# Speed of the drone
S = 60
# Frames per second of the pygame window display
FPS = 25

yolov3 = YOLO()
agent = DDPG()


class FrontEnd(object):
    """ Maintains the Tello display and moves it through the keyboard keys.
        Press escape key to quit.
        The controls are:
            - T: Takeoff
            - L: Land
            - Arrow keys: Forward, backward, left and right.
            - A and D: Counter clockwise and clockwise rotations
            - W and S: Up and down.
    """
    def __init__(self):
        # Init pygame
        pygame.init()