def run_test_episode(agent: DDPG, task: Task, file_output):
    print('\nRunning test episode ...')

    labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
              'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
              'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4', 'reward']
    results = {x: [] for x in labels}

    # Temporarily swap the agent's exploration noise for zero OU noise so the test
    # episode follows the learned policy deterministically; restored after the run.
    aux_noise = copy.copy(agent.noise)
    agent.noise = OUNoise(agent.action_size, 0.0, 0.0, 0.0)

    state = agent.reset_episode() # start a new episode
    rewards_lists = defaultdict(list)
    print('state', state)
    print('state.shape', state.shape)

    # Run the simulation, and save the results.
    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        while True:
            rotor_speed = agent.act(state)
            # The agent outputs a single speed, applied equally to all four rotors.
            rotor_speeds = np.array([rotor_speed] * 4)
            # rotor_speeds = [405]*4
            # rotor_speeds = [500, 490, 500, 500]
            next_state, reward, done, new_rewards = task.step(rotor_speeds)
            for key, value in new_rewards.items():
                rewards_lists[key].append(value)

            to_write = ([task.sim.time] + list(task.sim.pose) + list(task.sim.v) +
                        list(task.sim.angular_v) + list(rotor_speeds) + [reward])
            for ii in range(len(labels)):
                results[labels[ii]].append(to_write[ii])
            writer.writerow(to_write)

            state = next_state


            if done:
                break

    # Restore noise
    agent.noise = copy.copy(aux_noise)

    print('Finished test episode!\n')
    return results, rewards_lists
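
# Possible usage (illustrative sketch; assumes Task and DDPG are constructed as in the
# examples below, with 'test_episode.csv' as an arbitrary output file name):
if __name__ == '__main__':
    task = Task(target_pos=np.array([0., 0., 10.]))
    agent = DDPG(task)
    results, rewards_lists = run_test_episode(agent, task, 'test_episode.csv')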
Example #2
def main(num_episodes: int = 200):

    target_pos = np.array([0., 0., 140.])
    task = Task(target_pos=target_pos)
    agent = DDPG(task)
    best_score = -1000
    best_x = 0
    best_y = 0
    best_z = 0
    best_episode = 0
    data = {}

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        score = 0

        while True:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            score += reward

            if score > best_score:
                best_x = task.sim.pose[0]
                best_y = task.sim.pose[1]
                best_z = task.sim.pose[2]
                best_episode = i_episode
            best_score = max(score, best_score)
            # Overwritten every step; once the episode ends this holds the final step's values.
            data[i_episode] = {'Episode': i_episode, 'Reward': score, 'Action': action, 'Best_Score': best_score,
                               'x': task.sim.pose[0], 'y': task.sim.pose[1], 'z': task.sim.pose[2]}
            if done:
                print(
                    "\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), last_position = ({:5.1f},{:5.1f},{:5.1f}), best_position = ({:5.1f},{:5.1f},{:5.1f})".format(
                        i_episode, score, best_score, task.sim.pose[0], task.sim.pose[1], task.sim.pose[2], best_x, best_y,
                        best_z), end="")
                break
        sys.stdout.flush()

    # Return the per-episode log so callers can inspect or plot the results.
    return data
Example #3
import gym
import numpy as np

from pendulum_task import PendulumTask
from agents.agent import DDPG

task = PendulumTask()
agent = DDPG(task)
env = gym.make("Pendulum-v0")

done = False
n_episodes = 400
rewards = np.zeros(n_episodes)
for i in range(n_episodes):
    cur_state = env.reset()
    agent.reset_episode(cur_state)
    while True:
        env.render()
        # random_action is sampled but not used below; the agent's policy picks the action.
        random_action = env.action_space.sample()
        action = agent.act(cur_state)

        new_state, reward, done, _ = env.step(action)
        rewards[i] += reward

        #train step
        agent.step(action, reward, new_state, done)

        if done:
            print("\rEpisode = {:4d}, total_reward = {:7.3f}".format(
                i, rewards[i]))
            break
        else:
            cur_state = new_state
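
# Close the rendering window once all episodes have finished.
env.close()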
Example #4
def drive(num_episodes=1000, sample_distance=100, task_renew_distance=100, target_pos=np.array([0., 0., 10.]), initial_pos=None, sample_cb=None, running_size=10):
    agent = DDPG()

    positions = []
    rewards = []
    initial_poss = []
    target_poss = []
    distances = []
    times = []
    running_reward = []
    running_time = []
    running_distance = []

    max_reward = -100000

    for i_episode in range(0, num_episodes):

        if i_episode % task_renew_distance == 0:
            epi_init_pos, task = new_task(initial_pos, target_pos)
            agent.new_task(task)

        state = agent.reset_episode()

        epi_positions = []
        epi_reward = 0
        epi_distances = []

        while True:
            action = agent.act(state) 
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            epi_reward += reward

            epi_positions.append(task.sim.pose[:3])
            epi_distances.append(task.current_distance)

            if done:
                break

        avg_distance = np.average(epi_distances)

        print("\rEpisode = {:4d}, Reward = {:4n}, Avg Distance = {:4n}, time = {:4n}".format(i_episode + 1, epi_reward, avg_distance, task.sim.time), end="")

        rewards.append(epi_reward)
        distances.append(avg_distance)
        times.append(task.sim.time)

        if running_size < i_episode:
            running_reward.append(np.average(rewards[i_episode - running_size : i_episode]))
            running_time.append(np.average(times[i_episode - running_size : i_episode]))
            running_distance.append(np.average(distances[i_episode - running_size : i_episode]))
        else:
            running_reward.append(0)
            running_time.append(0)
            running_distance.append(0)

        positions.append(epi_positions)

        if i_episode % sample_distance == 0:
            max_reward = max(max_reward, epi_reward)
            initial_poss.append(epi_init_pos)
            target_poss.append(target_pos)

            if sample_cb is not None:
                sample_cb(epi_init_pos, positions, target_pos, rewards, distances, times, running_reward, running_time, running_distance)

            positions = []

        sys.stdout.flush()

    return epi_init_pos, positions, target_pos, rewards, distances, times, running_reward, running_time, running_distance
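
# The new_task helper used above is not defined in this snippet. A minimal sketch,
# assuming the Task constructor seen in the other examples and a randomized start
# pose when initial_pos is None (both are assumptions, not the original implementation):
def new_task(initial_pos, target_pos):
    if initial_pos is None:
        # Hypothetical: random x/y/z near the origin with a level orientation.
        epi_init_pos = np.concatenate([np.random.uniform(-5.0, 5.0, size=3), np.zeros(3)])
    else:
        epi_init_pos = np.asarray(initial_pos, dtype=float)
    task = Task(init_pose=epi_init_pos, target_pos=target_pos)
    return epi_init_pos, task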
Example #5
task = Task(target_pos=target_pos, init_pose=init_pose, init_angle_velocities=init_angle_velocities, 
    init_velocities=init_velocities)
agent = DDPG(task) 
worst_score = float('inf')
best_score = float('-inf')

reward_labels = ['episode', 'reward', 'rolling10', 'rolling100']
reward_results = {x : [] for x in reward_labels}

rolling_score_10 = deque(maxlen=10)
rolling_score_100 = deque(maxlen=100)
for i_episode in range(1, num_episodes+1):
    state = agent.reset_episode() # start a new episode
    score = 0
    while True:
        action = agent.act(state) 
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        score += reward
        best_score = max(best_score, score)
        worst_score = min(worst_score, score)
        if done:
            rolling_score_10.append(score)
            rolling_score_100.append(score)
            print("\rEpisode = {:4d}, score = {:7.3f}, best = {:7.3f} , worst = {:7.3f}), rolling = {:7.3f}/{:7.3f}".format(
               i_episode, score, best_score, worst_score, np.mean(rolling_score_10), np.mean(rolling_score_100)), end="")
            break
    reward_results['episode'].append(i_episode)
    reward_results['reward'].append(score)
    reward_results['rolling10'].append(np.mean(rolling_score_10))
    reward_results['rolling100'].append(np.mean(rolling_score_100))
Example #6
def main():
    labels = [
        'time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
        'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
        'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3',
        'rotor_speed4'
    ]
    file_output = 'data.txt'

    # write initial row
    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)

    num_episodes = 1000
    run_time = 10.
    target_pos = np.array([0., 0., 10.])  # takeoff and stay in place
    init_pose = np.array([0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
    init_velocities = np.array([0.0, 0.0, 0.0])
    init_angle_velocities = np.array([0.0, 0.0, 0.0])

    task = TakeoffTask(init_pose=init_pose,
                       target_pos=target_pos,
                       runtime=run_time)
    agent = DDPG(task)

    best_score = -np.inf

    results_list = []
    rewards_list = []

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # start a new episode
        count = 0
        total_reward = 0

        results = {x: [] for x in labels}
        rewards = []

        while True:
            action = agent.act(state)  # noise is added for exploration
            next_state, reward, done = task.step(action)

            total_reward += reward
            rewards.append(reward)

            agent.step(action, reward, next_state, done)
            state = next_state

            to_write = [task.sim.time] + list(task.sim.pose) + list(
                task.sim.v) + list(task.sim.angular_v) + list(action)
            for ii in range(len(labels)):
                results[labels[ii]].append(to_write[ii])

            write_to_csv(to_write)

            count += 1
            if done:
                score = total_reward / float(count) if count else 0.0

                results_list.append(results)
                rewards_list.append(rewards)

                if score > best_score:
                    best_score = score

                # plot every 200 episodes

                if i_episode % 200 == 0:
                    print('i should be plotting something now.')
                    print('episode {}'.format(i_episode))

                print(
                    "\rEpisode = {:4d}, score = {:7.3f}, best_score = {:7.3f}, reward for episode = {}"
                    .format(i_episode, score, best_score, total_reward),
                    end="")  # [debug]
                break
        sys.stdout.flush()
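
# The write_to_csv helper used in main() is not shown here. A minimal sketch,
# assuming it appends one row to the same 'data.txt' file whose header row
# main() writes at startup:
def write_to_csv(row, file_output='data.txt'):
    # Append a single result row to the CSV log.
    with open(file_output, 'a') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(row)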