Example 1
def train_straight_DDPG(episodes, agent):
    # Currently we need a single action output: the amount of
    # acceleration for the straight-driving vehicle.
    # These shapes are the number of neural network inputs/outputs.
    action_space = 1
    state_space = 1

    # Currently we don't resize the radar data to (4, len(radar))
    # since it has to be flattened anyway
    radar_space = 600

    # Get the first state (a single value here, since state_space is 1)
    # Create model
    straight_model = md.DDPG(action_space, state_space, radar_space,
                             'Straight_Model')

    # Update rate of target
    tau = 0.005

    # To store reward history of each episode
    ep_reward_list = []
    # To store average reward history of last few episodes
    avg_reward_list = []
    # To store actor and critic loss
    actor_loss = []
    critic_loss = []

    for epi in range(episodes):
        try:
            radar_state_prev = agent.reset(False)
            time.sleep(1)
            radar_state_prev = np.reshape(radar_state_prev, (1, radar_space))

            start_state = [0]
            state = np.reshape(start_state, (1, state_space))
            score = 0
            max_step = 5_000

            actor_loss_epi = []
            critic_loss_epi = []
            length_epi = []
            for i in range(max_step):
                choice = straight_model.policy(radar_state_prev, state)
                action = choose_action_straight(choice)

                print(
                    f"action----{action}-----epsilon----{straight_model.epsilon}"
                )

                radar_state_next, next_state, reward, done, length_traversed = agent.step_straight(
                    action, 1)
                time.sleep(0.5)

                score += reward
                next_state = np.reshape(next_state, (1, state_space))
                straight_model.remember(radar_state_prev, radar_state_next,
                                        state, choice, reward, next_state,
                                        done)
                state = next_state
                radar_state_prev = np.reshape(radar_state_next,
                                              (1, radar_space))

                # This is back-prop, updating weights
                lossActor, lossCritic = straight_model.replay()
                actor_loss_epi.append(lossActor)
                critic_loss_epi.append(lossCritic)

                # Soft update of the target model; doing it slowly keeps training stable
                straight_model.update_target(tau, epi)
                if done:
                    length_epi.append(length_traversed)
                    break

            actor_loss.append(np.mean(actor_loss_epi))
            critic_loss.append(np.mean(critic_loss_epi))

            # Do a stronger update at the end of the episode (tau=0.01 here;
            # passing tau=1 would be a full HARD copy into the target actor and critic)
            straight_model.update_target(0.01, epi)

            ep_reward_list.append(score)
            print("\nepisode: {}/{}, score: {}".format(epi, episodes, score))

            avg_reward = np.mean(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            # length_epi holds at most one entry per episode, so guard against
            # episodes that never reached a terminal state
            avg_length = np.mean(length_epi) if length_epi else 0.0
            print(
                "\nEpisode * {} * Avg Reward is ==> {} Avg Length is ==> {}\n".
                format(epi, avg_reward, avg_length))
            avg_reward_list.append(avg_reward)

            # Update log stats (every given number of episodes)
            min_reward = min(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            max_reward = max(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            # straight_model.tensorboard.update_stats(reward_avg=avg_reward, reward_min=min_reward, reward_max=max_reward, epsilon=straight_model.epsilon)
            straight_model.tensorboard.update_stats(
                reward_avg=[None, avg_reward],
                critic_loss=[None, np.mean(critic_loss_epi)],
                actor_loss=[None, np.mean(actor_loss_epi)],
                lenght_covered=[None, np.mean(avg_length)])

            if epi % 100 == 0 and epi > 0:
                x_label = 'Episodes'
                y_label = 'Actor Loss'
                ut.plot(actor_loss, x_label, y_label, epi)
                time.sleep(1)
                y_label = 'Critic Loss'
                ut.plot(critic_loss, x_label, y_label, epi)
                time.sleep(1)

        finally:
            print(f"Task Completed! Episode {epi}")

            straight_model.save_model()
            if agent is not None:
                agent.destroy()
                time.sleep(1)

    return actor_loss, critic_loss
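
The md.DDPG class itself is not shown in these examples. As an illustration of the soft target update that the comments above refer to, a minimal Polyak-averaging sketch is given below; the function name, the weight-list representation and the tau semantics are assumptions, not the original implementation.

import numpy as np

def soft_update(target_weights, online_weights, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    # With tau = 1 this degenerates into the "hard" copy mentioned in the comments.
    return [tau * w_o + (1.0 - tau) * w_t
            for w_o, w_t in zip(online_weights, target_weights)]

# Example usage with numpy arrays standing in for layer weights:
# target_weights = soft_update(target_weights, online_weights, tau=0.005)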
Example 2
def train_straight_DDPG(episodes, agent):
    # Currently we need a single action output: the amount of
    # acceleration for the straight-driving vehicle.
    action_space = 1
    state_space = 2
    # Get the first state (speed, distance from junction)
    # Create model
    straight_model = md.DDPG(action_space, state_space, 'Straight_Model')

    # Update rate of target
    tau = 0.01

    # To store reward history of each episode
    ep_reward_list = []
    # To store average reward history of last few episodes
    avg_reward_list = []
    # To store actor and critic loss
    actor_loss = []
    critic_loss = []

    for epi in range(episodes):
        try:
            agent.reset(False)
            time.sleep(1)

            start_state = [0, round(agent.get_location().x - 19, 4)]
            state = np.reshape(start_state, (1, 2))
            score = 0
            max_step = 1_000
            for i in range(max_step):
                choice = straight_model.policy(state)
                action = choose_action_straight(choice)

                p = 0
                if i % 10 == 0:
                    print(
                        f"action----{action}-----epsilon----{straight_model.epsilon}"
                    )
                    p = 1

                next_state, reward, done, _ = agent.step_straight(action, p)
                time.sleep(1)

                score += reward
                next_state = np.reshape(next_state, (1, 2))
                straight_model.remember(state, choice, reward, next_state,
                                        done)
                state = next_state

                # This is back-prop, updating weights
                lossActor, lossCritic = straight_model.replay()
                actor_loss.append(lossActor)
                critic_loss.append(lossCritic)

                # Soft update of the target model; doing it slowly keeps training stable
                straight_model.update_target(tau)
                if done:
                    break

            # Append episode reward to a list
            ep_reward_list.append(score)
            print("\nepisode: {}/{}, score: {}".format(epi, episodes, score))

            avg_reward = np.mean(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            print("\nEpisode * {} * Avg Reward is ==> {}\n".format(
                epi, avg_reward))
            avg_reward_list.append(avg_reward)

            # Update log stats (every given number of episodes)
            if not epi % AGGREGATE_STATS_EVERY or epi == 1:
                min_reward = min(ep_reward_list[-AGGREGATE_STATS_EVERY:])
                max_reward = max(ep_reward_list[-AGGREGATE_STATS_EVERY:])
                straight_model.tensorboard.update_stats(
                    reward_avg=avg_reward,
                    reward_min=min_reward,
                    reward_max=max_reward,
                    epsilon=straight_model.epsilon)

        finally:
            print(f"Task Completed! Episode {epi}")

            straight_model.save_model()
            if agent is not None:
                agent.destroy()
                time.sleep(3)

    return actor_loss, critic_loss
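
Example 2 prints straight_model.epsilon alongside each action, which suggests exploration noise that decays over training. A minimal sketch of such a policy is shown below, assuming a callable actor and Gaussian noise scaled by epsilon; none of these names come from the original code.

import numpy as np

def noisy_policy(actor_fn, state, epsilon, noise_std=0.1, low=-1.0, high=1.0):
    # actor_fn maps a state batch to a deterministic action batch; the noise,
    # scaled by the decaying epsilon, provides exploration early in training.
    action = np.asarray(actor_fn(state), dtype=np.float64)
    action = action + epsilon * np.random.normal(0.0, noise_std, size=action.shape)
    return np.clip(action, low, high)  # keep the action inside a valid range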
Example 3
def train_rightturn_DDPG(episodes, agent):
    # A single action output: the amount of acceleration
    # applied while the vehicle takes the right turn.
    # These shapes are the number of neural network inputs/outputs.
    action_space = 1
    state_space = 2

    radar_space = 400

    # Get the first state (speed, distance from junction)
    # Create model
    rightturn_model = md.DDPG(action_space, state_space, radar_space,
                              'Right_Turn_Model')

    # Update rate of target
    tau = 0.005

    # To store reward history of each episode
    ep_reward_list = []
    # To store average reward history of last few episodes
    avg_reward_list = []
    # To store actor and critic loss
    actor_loss = []
    critic_loss = []

    # For debugging the reward function (only used by the commented-out block below)
    epi_count = 150
    epirange = 200
    for epi in range(episodes):
        try:
            loc = random.randint(30, 130)
            print('--------Spawn Succeeded RightTurn-----------')
            radar_state_prev = agent.reset(False, loc)
            radar_state_prev = np.reshape(radar_state_prev, (1, radar_space))
            start_state = [50, 90]
            state = np.reshape(start_state, (1, state_space))
            score = 0
            max_step = 500

            actor_loss_epi = []
            critic_loss_epi = []
            for i in range(max_step):
                choice = rightturn_model.policy(radar_state_prev, state)
                action = choose_action_rightturn(choice)
                # print(f'action1------------{action}')
                # if(epi>=epi_count and epi_count<epirange):
                # 	action =  choose_action_rightturn(0.2)
                # 	choice = 0.2
                print(
                    f'action----{action}-------epsilon----{rightturn_model.epsilon}'
                )
                radar_state_next, next_state, reward, done, _ = agent.step_rightturn(
                    action, 1)
                # print(f'next_state-----{next_state}-----reward---{next_state}----{done}')
                time.sleep(0.2)

                score += reward
                next_state = np.reshape(next_state, (1, state_space))
                rightturn_model.remember(radar_state_prev, radar_state_next,
                                         state, choice, reward, next_state,
                                         done)
                state = next_state
                radar_state_prev = np.reshape(radar_state_next,
                                              (1, radar_space))

                # This is back-prop, updating weights
                lossActor, lossCritic = rightturn_model.replay()
                actor_loss_epi.append(lossActor)
                critic_loss_epi.append(lossCritic)

                # Soft update of the target model; doing it slowly keeps training stable
                rightturn_model.update_target(tau)

                if done:
                    break
            actor_loss.append(np.mean(actor_loss_epi))
            critic_loss.append(np.mean(critic_loss_epi))

            # Do a stronger update at the end of the episode (tau=0.01 here;
            # passing tau=1 would be a full HARD copy into the target actor and critic)
            rightturn_model.update_target(0.01)

            ep_reward_list.append(score)
            print("\nepisode: {}/{}, score: {}".format(epi, episodes, score))

            avg_reward = np.mean(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            print("\nEpisode * {} * Avg Reward is ==> {}\n".format(
                epi, avg_reward))
            avg_reward_list.append(avg_reward)

            # Update log stats (every given number of episodes)
            min_reward = min(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            max_reward = max(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            # straight_model.tensorboard.update_stats(reward_avg=avg_reward, reward_min=min_reward, reward_max=max_reward, epsilon=straight_model.epsilon)
            rightturn_model.tensorboard.update_stats(
                reward_avg=avg_reward,
                critic_loss=np.mean(critic_loss_epi),
                actor_loss=np.mean(actor_loss_epi))

            if epi % 100 == 0 and epi > 1:
                x_label = 'Episodes'
                y_label = 'Actor Loss'
                ut.plot(actor_loss, x_label, y_label, epi)
                y_label = 'Critic Loss'
                ut.plot(critic_loss, x_label, y_label, epi)

            # # Average score of last 100 episode
            # if avg_reward > 500:
            # 	print('\n Task Completed! \n')
            # 	break

        finally:
            print(f"Task Completed! Episode {epi}")

            rightturn_model.save_model()
            if agent is not None:
                agent.destroy()
                time.sleep(1)

    return actor_loss, critic_loss
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = args.num_params
    action_dim = args.num_params
    max_action = 0.125

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    policy = model.DDPG(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(model.DDPG, 4, './data/sound.wav', args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
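
The final fragment constructs utils.ReplayBuffer(state_dim, action_dim), but that class is not included in the excerpt. For reference, a minimal numpy replay buffer compatible with that constructor might look like the sketch below; the buffer size and method names are assumptions.

import numpy as np

class ReplayBuffer:
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0          # next write position
        self.size = 0         # number of stored transitions
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        # Overwrite the oldest transition once the buffer is full (ring buffer).
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.done[self.ptr] = float(done)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample a batch of stored transitions.
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.state[idx], self.action[idx], self.next_state[idx],
                self.reward[idx], self.done[idx])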