def train_straight_DDPG(episodes, agent):
    # Currently we need one action output: the amount of acceleration
    # of the straight-driving vehicle.
    # These shapes are the numbers of network inputs/outputs.
    action_space = 1
    state_space = 1
    # The radar data is not resized to (4, len(radar)) here,
    # since it has to be flattened anyway.
    radar_space = 600

    # Get the first state (speed, distance from junction)
    # Create model
    straight_model = md.DDPG(action_space, state_space, radar_space,
                             'Straight_Model')

    # Update rate of the target networks
    tau = 0.005

    # To store the reward history of each episode
    ep_reward_list = []
    # To store the average reward history of the last few episodes
    avg_reward_list = []
    # To store actor and critic loss
    actor_loss = []
    critic_loss = []

    for epi in range(episodes):
        try:
            radar_state_prev = agent.reset(False)
            time.sleep(1)
            radar_state_prev = np.reshape(radar_state_prev, (1, radar_space))
            start_state = [0]
            state = np.reshape(start_state, (1, state_space))
            score = 0
            max_step = 5_000
            actor_loss_epi = []
            critic_loss_epi = []
            length_epi = []

            for i in range(max_step):
                choice = straight_model.policy(radar_state_prev, state)
                action = choose_action_straight(choice)
                print(
                    f"action----{action}-----epsilon----{straight_model.epsilon}"
                )
                radar_state_next, next_state, reward, done, length_traversed = agent.step_straight(
                    action, 1)
                time.sleep(0.5)
                score += reward
                next_state = np.reshape(next_state, (1, state_space))
                straight_model.remember(radar_state_prev, radar_state_next,
                                        state, choice, reward, next_state,
                                        done)
                state = next_state
                radar_state_prev = np.reshape(radar_state_next,
                                              (1, radar_space))

                # This is back-prop, updating the weights
                lossActor, lossCritic = straight_model.replay()
                actor_loss_epi.append(lossActor)
                critic_loss_epi.append(lossCritic)

                # Update the target model slowly (SOFT update); this keeps training stable
                straight_model.update_target(tau, epi)
                if done:
                    # length_epi only gets a value when the episode terminates
                    length_epi.append(length_traversed)
                    break

            actor_loss.append(np.mean(actor_loss_epi))
            critic_loss.append(np.mean(critic_loss_epi))

            # End-of-episode target update with a larger rate (tau=0.01)
            straight_model.update_target(0.01, epi)

            ep_reward_list.append(score)
            print("\nepisode: {}/{}, score: {}".format(epi, episodes, score))
            avg_reward = np.mean(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            avg_length = np.mean(length_epi[-AGGREGATE_STATS_EVERY:])
            print(
                "\nEpisode * {} * Avg Reward is ==> {} Avg Length is ==> {}\n".
                format(epi, avg_reward, avg_length))
            avg_reward_list.append(avg_reward)

            # Update log stats (every given number of episodes)
            min_reward = min(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            max_reward = max(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            # straight_model.tensorboard.update_stats(reward_avg=avg_reward, reward_min=min_reward, reward_max=max_reward, epsilon=straight_model.epsilon)
            straight_model.tensorboard.update_stats(
                reward_avg=[None, avg_reward],
                critic_loss=[None, np.mean(critic_loss_epi)],
                actor_loss=[None, np.mean(actor_loss_epi)],
                length_covered=[None, avg_length])

            if epi % 100 == 0 and epi > 0:
                x_label = 'Episodes'
                y_label = 'Actor Loss'
                ut.plot(actor_loss, x_label, y_label, epi)
                time.sleep(1)
                y_label = 'Critic Loss'
                ut.plot(critic_loss, x_label, y_label, epi)
                time.sleep(1)
        finally:
            print(f"Task Completed! Episode {epi}")
            straight_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(1)

    return actor_loss, critic_loss
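# The "SOFT" target update used above is the usual DDPG Polyak average:
# theta_target <- tau * theta_online + (1 - tau) * theta_target.
# A minimal sketch, assuming a Keras-style get_weights()/set_weights()
# interface; the function name and signature are illustrative, not
# md.DDPG's actual API.
def soft_update_sketch(online_model, target_model, tau=0.005):
    online_weights = online_model.get_weights()
    target_weights = target_model.get_weights()
    mixed = [
        tau * w_online + (1.0 - tau) * w_target
        for w_online, w_target in zip(online_weights, target_weights)
    ]
    target_model.set_weights(mixed)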
def train_straight_DDPG(episodes, agent):
    # Currently we need one action output: the amount of acceleration
    # of the straight-driving vehicle.
    action_space = 1
    state_space = 2
    # Get the first state (speed, distance from junction)

    # Create model
    straight_model = md.DDPG(action_space, state_space, 'Straight_Model')

    # Update rate of the target networks
    tau = 0.01

    # To store the reward history of each episode
    ep_reward_list = []
    # To store the average reward history of the last few episodes
    avg_reward_list = []
    # To store actor and critic loss
    actor_loss = []
    critic_loss = []

    for epi in range(episodes):
        try:
            agent.reset(False)
            time.sleep(1)
            start_state = [0, round(agent.get_location().x - 19, 4)]
            state = np.reshape(start_state, (1, 2))
            score = 0
            max_step = 1_000

            for i in range(max_step):
                choice = straight_model.policy(state)
                action = choose_action_straight(choice)
                p = 0
                if i % 10 == 0:
                    print(
                        f"action----{action}-----epsilon----{straight_model.epsilon}"
                    )
                    p = 1
                next_state, reward, done, _ = agent.step_straight(action, p)
                time.sleep(1)
                score += reward
                next_state = np.reshape(next_state, (1, 2))
                straight_model.remember(state, choice, reward, next_state,
                                        done)
                state = next_state

                # This is back-prop, updating the weights
                lossActor, lossCritic = straight_model.replay()
                actor_loss.append(lossActor)
                critic_loss.append(lossCritic)

                # Update the target model slowly; this keeps training stable
                straight_model.update_target(tau)
                if done:
                    break

            # Append episode reward to a list
            ep_reward_list.append(score)
            print("\nepisode: {}/{}, score: {}".format(epi, episodes, score))
            avg_reward = np.mean(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            print("\nEpisode * {} * Avg Reward is ==> {}\n".format(
                epi, avg_reward))
            avg_reward_list.append(avg_reward)

            # Update log stats (every given number of episodes)
            if not epi % AGGREGATE_STATS_EVERY or epi == 1:
                min_reward = min(ep_reward_list[-AGGREGATE_STATS_EVERY:])
                max_reward = max(ep_reward_list[-AGGREGATE_STATS_EVERY:])
                straight_model.tensorboard.update_stats(
                    reward_avg=avg_reward,
                    reward_min=min_reward,
                    reward_max=max_reward,
                    epsilon=straight_model.epsilon)
        finally:
            print(f"Task Completed! Episode {epi}")
            straight_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(3)

    return actor_loss, critic_loss
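# choose_action_straight() is defined elsewhere in the project; a plausible
# minimal sketch is a clamp-and-split of the actor's scalar output (assumed to
# lie in [-1, 1]) into a (throttle, brake) pair that agent.step_straight()
# could apply. The value range and the (throttle, brake) encoding are
# assumptions for illustration, not the project's actual mapping.
def choose_action_straight_sketch(choice):
    value = max(-1.0, min(1.0, float(choice)))
    if value >= 0.0:
        return value, 0.0   # positive output -> throttle, no brake
    return 0.0, -value      # negative output -> brake, no throttle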
def train_rightturn_DDPG(episodes, agent):
    # One continuous action output for the right-turning vehicle
    # (mapped to a control by choose_action_rightturn).
    # These shapes are the numbers of network inputs/outputs.
    action_space = 1
    state_space = 2
    radar_space = 400

    # Get the first state (speed, distance from junction)
    # Create model
    rightturn_model = md.DDPG(action_space, state_space, radar_space,
                              'Right_Turn_Model')

    # Update rate of the target networks
    tau = 0.005

    # To store the reward history of each episode
    ep_reward_list = []
    # To store the average reward history of the last few episodes
    avg_reward_list = []
    # To store actor and critic loss
    actor_loss = []
    critic_loss = []

    # For debugging the reward function
    epi_count = 150
    epirange = 200

    for epi in range(episodes):
        try:
            loc = random.randint(30, 130)
            print('--------Spawn Succeeded RightTurn-----------')
            radar_state_prev = agent.reset(False, loc)
            radar_state_prev = np.reshape(radar_state_prev, (1, radar_space))
            start_state = [50, 90]
            state = np.reshape(start_state, (1, state_space))
            score = 0
            max_step = 500
            actor_loss_epi = []
            critic_loss_epi = []

            for i in range(max_step):
                choice = rightturn_model.policy(radar_state_prev, state)
                action = choose_action_rightturn(choice)
                # print(f'action1------------{action}')
                # if epi >= epi_count and epi_count < epirange:
                #     action = choose_action_rightturn(0.2)
                #     choice = 0.2
                print(
                    f'action----{action}-------epsilon----{rightturn_model.epsilon}'
                )
                radar_state_next, next_state, reward, done, _ = agent.step_rightturn(
                    action, 1)
                # print(f'next_state-----{next_state}-----reward---{reward}----{done}')
                time.sleep(0.2)
                score += reward
                next_state = np.reshape(next_state, (1, state_space))
                rightturn_model.remember(radar_state_prev, radar_state_next,
                                         state, choice, reward, next_state,
                                         done)
                state = next_state
                radar_state_prev = np.reshape(radar_state_next,
                                              (1, radar_space))

                # This is back-prop, updating the weights
                lossActor, lossCritic = rightturn_model.replay()
                actor_loss_epi.append(lossActor)
                critic_loss_epi.append(lossCritic)

                # Update the target model slowly (SOFT update); this keeps training stable
                rightturn_model.update_target(tau)
                if done:
                    break

            actor_loss.append(np.mean(actor_loss_epi))
            critic_loss.append(np.mean(critic_loss_epi))

            # End-of-episode target update with a larger rate (tau=0.01)
            rightturn_model.update_target(0.01)

            ep_reward_list.append(score)
            print("\nepisode: {}/{}, score: {}".format(epi, episodes, score))
            avg_reward = np.mean(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            print("\nEpisode * {} * Avg Reward is ==> {}\n".format(
                epi, avg_reward))
            avg_reward_list.append(avg_reward)

            # Update log stats (every given number of episodes)
            min_reward = min(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            max_reward = max(ep_reward_list[-AGGREGATE_STATS_EVERY:])
            # rightturn_model.tensorboard.update_stats(reward_avg=avg_reward, reward_min=min_reward, reward_max=max_reward, epsilon=rightturn_model.epsilon)
            rightturn_model.tensorboard.update_stats(
                reward_avg=avg_reward,
                critic_loss=np.mean(critic_loss_epi),
                actor_loss=np.mean(actor_loss_epi))

            if epi % 100 == 0 and epi > 1:
                x_label = 'Episodes'
                y_label = 'Actor Loss'
                ut.plot(actor_loss, x_label, y_label, epi)
                y_label = 'Critic Loss'
                ut.plot(critic_loss, x_label, y_label, epi)

            # # Average score of the last 100 episodes
            # if avg_reward > 500:
            #     print('\n Task Completed! \n')
            #     break
        finally:
            print(f"Task Completed! Episode {epi}")
            rightturn_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(1)

    return actor_loss, critic_loss
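# ut.plot() is called above to dump the accumulated loss curves every 100
# episodes; a minimal sketch of such a helper, assuming matplotlib is
# available and that writing one PNG per checkpoint is acceptable. The
# function name and file-naming scheme are assumptions, not ut's actual code.
import matplotlib.pyplot as plt

def plot_sketch(values, x_label, y_label, episode):
    plt.figure()
    plt.plot(values)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(f"{y_label} up to episode {episode}")
    plt.savefig(f"{y_label.replace(' ', '_').lower()}_episode_{episode}.png")
    plt.close()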
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = args.num_params
action_dim = args.num_params
max_action = 0.125

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args.discount,
    "tau": args.tau,
}
policy = model.DDPG(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./models/{policy_file}")

replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

# Evaluate untrained policy
evaluations = [eval_policy(policy, 4, './data/sound.wav', args.seed)]

state, done = env.reset(), False
episode_reward = 0
episode_timesteps = 0
episode_num = 0
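# utils.ReplayBuffer is constructed above with (state_dim, action_dim); a
# minimal sketch of such a buffer in the common DDPG/TD3 style, reusing the
# numpy/torch imports already present in this script. The class name, method
# names, max_size default, and returned tensor types are assumptions about
# utils.ReplayBuffer, not its actual implementation.
class ReplayBufferSketch:
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        # Overwrite the oldest transition once the buffer is full.
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1.0 - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniform random minibatch for the critic/actor updates.
        idx = np.random.randint(0, self.size, size=batch_size)
        return (
            torch.FloatTensor(self.state[idx]),
            torch.FloatTensor(self.action[idx]),
            torch.FloatTensor(self.next_state[idx]),
            torch.FloatTensor(self.reward[idx]),
            torch.FloatTensor(self.not_done[idx]),
        )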