def _enjoy():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
def _train(args): print("Running Expert for {} Episodes of {} Steps".format(args.episodes, args.steps)) print("Training Learning for {} Epochs with Batch Size of {}".format(args.epochs, args.batch_size)) env = launch_env() env = ResizeWrapper(env) env = NormalizeWrapper(env) env = ActionWrapper(env) env = DtRewardWrapper(env) print("Initialized Wrappers") observation_shape = (None, ) + env.observation_space.shape action_shape = (None, ) + env.action_space.shape # Create an imperfect demonstrator expert = PurePursuitExpert(env=env) observations = [] actions = [] # let's collect our samples for episode in range(0, args.episodes): print("Starting episode", episode) for steps in range(0, args.steps): # use our 'expert' to predict the next action. action = expert.predict(None) observation, reward, done, info = env.step(action) observations.append(observation) actions.append(action) env.reset() env.close() actions = np.array(actions) observations = np.array(observations) model = TensorflowModel( observation_shape=observation_shape, # from the logs we've got action_shape=action_shape, # same graph_location=args.model_directory, # where do we want to store our trained models seed=args.seed # to seed all random operations in the model (e.g., dropout) ) for i in range(args.epochs): # we defined the batch size, this can be adjusted according to your computing resources loss = None for batch in range(0, len(observations), args.batch_size): print("Training batch", batch) loss = model.train( observations=observations[batch:batch + args.batch_size], actions=actions[batch:batch + args.batch_size] ) # every 10 epochs, we store the model we have if i % 10 == 0: model.commit() print("Training complete!")
def initenv2():
    env = launch_env2()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    return env
def make_env():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    return env
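# Quick usage sketch for make_env(): roll out a few random actions to sanity-check
# the wrapper stack. It relies only on the standard Gym API (reset/step/render/close)
# that the wrapped environment exposes; it is an illustrative snippet, not part of
# the training pipeline. The helper name and num_steps default are assumptions.
def _sanity_check_env_sketch(num_steps=10):
    env = make_env()
    obs = env.reset()
    print("Observation shape after wrappers:", obs.shape)  # expected 3x160x120 after ImgWrapper
    for _ in range(num_steps):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        env.render()
        if done:
            obs = env.reset()
    env.close()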
def _enjoyWindow():
    model = WindowModel(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('./models/windowimitate.pt')
        model.load_state_dict(state_dict)
    except Exception:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env1()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    # Sliding window of the last four 3x160x120 observations, stacked along the channel axis
    obsWindow = np.zeros((12, 160, 120))

    while True:
        # Drop the oldest frame, shift the rest forward, and append the newest observation
        obsWindow[:9, :, :] = obsWindow[3:, :, :]
        obsWindow[9:12, :, :] = obs

        obs = torch.from_numpy(obsWindow).float().to(device).unsqueeze(0)
        action = model(obs)
        action = action.squeeze().data.cpu().numpy()
        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)
            obs = env.reset()
            env.render()
def _enjoy():
    model = Model(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('trained_models/imitate.pt', map_location=device)
        model.load_state_dict(state_dict)
    except Exception:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)
        action = model(obs)
        action = action.squeeze().data.cpu().numpy()
        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)
            obs = env.reset()
            env.render()
def extract():
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0
    episode_timesteps = 0

    obs = env.reset()

    print("Starting training")
    while total_timesteps < 2:
        action = env.action_space.sample()

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        obs = new_obs

        print(obs)
        print(action)

        total_timesteps += 1
def _enjoy(args):
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    # policy = TD3(state_dim, action_dim, max_action, net_type="cnn")
    # policy.load(filename=args.policy, directory='reinforcement/pytorch/models/')
    policy = policies[args.policy](state_dim, action_dim, max_action)
    policy.load("reinforcement/pytorch/models/", args.policy)

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
def initWindowModel():
    model = WindowModel(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('./models/windowimitate.pt')
        model.load_state_dict(state_dict)
    except Exception:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env1()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    env.reset()
    env.render()

    return env, model
def _train(args): if not os.path.exists("./results"): os.makedirs("./results") if not os.path.exists(args.model_dir): os.makedirs(args.model_dir) # Launch the env with our helper function env = launch_env() print("Initialized environment") # Wrappers env = ResizeWrapper(env) env = NormalizeWrapper(env) env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120 env = ActionWrapper(env) env = DtRewardWrapper(env) print("Initialized Wrappers") # Set seeds seed(args.seed) state_dim = env.observation_space.shape action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy policy = DDPG(state_dim, action_dim, max_action, net_type="cnn") replay_buffer = ReplayBuffer(args.replay_buffer_max_size) print("Initialized DDPG") # Evaluate untrained policy evaluations = [evaluate_policy(env, policy)] total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 done = True episode_reward = None env_counter = 0 reward = 0 episode_timesteps = 0 print("Starting training") while total_timesteps < args.max_timesteps: print("timestep: {} | reward: {}".format(total_timesteps, reward)) if done: if total_timesteps != 0: print( ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (total_timesteps, episode_num, episode_timesteps, episode_reward)) policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau) # Evaluate episode if timesteps_since_eval >= args.eval_freq: timesteps_since_eval %= args.eval_freq evaluations.append(evaluate_policy(env, policy)) print("rewards at time {}: {}".format( total_timesteps, evaluations[-1])) if args.save_models: policy.save(filename='{}_{}'.format( 'ddpg', total_timesteps), directory=args.model_dir) np.savez("./results/rewards.npz", evaluations) # Reset environment env_counter += 1 obs = env.reset() done = False episode_reward = 0 episode_timesteps = 0 episode_num += 1 # Select action randomly or according to policy if total_timesteps < args.start_timesteps: action = env.action_space.sample() else: action = policy.predict(np.array(obs)) if args.expl_noise != 0: action = (action + np.random.normal( 0, args.expl_noise, size=env.action_space.shape[0])).clip( env.action_space.low, env.action_space.high) # Perform action new_obs, reward, done, _ = env.step(action) if episode_timesteps >= args.env_timesteps: done = True done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float( done) episode_reward += reward # Store data in replay buffer replay_buffer.add(obs, new_obs, action, reward, done_bool) obs = new_obs episode_timesteps += 1 total_timesteps += 1 timesteps_since_eval += 1 print("Training done, about to save..") policy.save(filename='ddpg', directory=args.model_dir) print("Finished saving..should return now!")
def _train(args):
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = DtRewardWrapper(env)
    env = MetricsWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0], (args.batch_size))
        obs_batch = torch.from_numpy(observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss = loss.item()  # .data[0] is deprecated; .item() extracts the scalar loss
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, avg_loss))

        # Periodically save the trained model
        if epoch % 200 == 0:
            torch.save(model.state_dict(), 'imitation/pytorch/models/imitate.pt')
def _dagger():
    model = Model(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('./models/imitate.pt')
        model.load_state_dict(state_dict)
    except Exception:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env1()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()
    env.render()

    # Register a keyboard handler so a human expert can relabel the actions
    key_handler = key.KeyStateHandler()
    env.unwrapped.window.push_handlers(key_handler)
    print(env.map_name)

    obsHistory = []
    actionHistory = []
    save_count = 0  # index used to number the saved DAgger datasets

    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)
        action = model(obs)
        action = action.squeeze().data.cpu().numpy()
        obs, reward, done, info = env.step(action)

        # Expert correction from the keyboard (the DAgger label for this step)
        daggerAction = np.array([0.0, 0.0])
        if key_handler[key.UP]:
            daggerAction = np.array([1.00, 0.0])
            # action = np.array([0.44, 0.0])
        if key_handler[key.DOWN]:
            daggerAction = np.array([-1.00, 0])
            # action = np.array([-0.44, 0])
        if key_handler[key.LEFT]:
            daggerAction = np.array([0.35, +1])
        if key_handler[key.RIGHT]:
            daggerAction = np.array([0.35, -1])
        if key_handler[key.SPACE]:
            # Dump the collected observations and expert labels to disk
            obsHistoryArray = np.array(obsHistory)
            actionHistoryArray = np.array(actionHistory)
            np.save('./dagger/obs_{}.npy'.format(save_count), obsHistoryArray)
            np.save('./dagger/actions_{}.npy'.format(save_count), actionHistoryArray)
            save_count += 1

        print(daggerAction)
        obsHistory.append(obs)
        actionHistory.append(daggerAction)

        env.render()
        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)
            obs = env.reset()
            env.render()
def _train(args):
    env = launch_env1()
    env1 = ResizeWrapper(env)
    env2 = NormalizeWrapper(env)
    env3 = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    def transformObs(obs):
        # Apply the observation wrappers manually to the recorded raw frames
        obs = env1.observation(obs)
        obs = env2.observation(obs)
        obs = env3.observation(obs)
        return obs

    actions = None
    rawObs = None

    # Load the recorded DAgger demonstrations for every map
    for map in MAP_NAMES:
        if map == "loop_obstacles":
            episodes = 3
        else:
            episodes = 2
        print(map)

        for episode in range(episodes):
            actionFile = "actions_{}.npy".format(episode)
            action = np.load(TRAINING_DATA_PATH.format(map, actionFile))
            print(action.shape)

            observationFile = "obs_{}.npy".format(episode)
            observation = np.load(TRAINING_DATA_PATH.format(map, observationFile))

            if actions is None:
                actions = action
                rawObs = observation
            else:
                actions = np.concatenate((actions, action), axis=0)
                rawObs = np.concatenate((rawObs, observation), axis=0)
            print(actions.shape)

    print(actions.shape)
    print("---")

    observations = np.zeros((rawObs.shape[0], 3, 160, 120))
    for i, obs in enumerate(rawObs):
        observations[i] = transformObs(obs)

    '''
    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, 2):
    # for episode in range(0, args.episodes):
        print("Starting episode", episode)
        # for steps in range(0, args.steps):
        for steps in range(0, 4):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()

    actions = np.array(actions)
    observations = np.array(observations)
    print(observations.shape)
    '''

    env.close()

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    loss_list = []
    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0], (args.batch_size))
        obs_batch = torch.from_numpy(observations[batch_indices]).float().to(device)
        # actions are continuous wheel commands, so keep them as floats
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss = loss.item()
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, loss))
        loss_list.append(loss)

        # Periodically save the trained model
        if epoch % 50 == 0:
            print("Saving...")
            torch.save(model.state_dict(), 'imitation/pytorch/models/imitate.pt')
            save_loss(loss_list, 'imitation/pytorch/loss.npy')

    print("Saving...")
    torch.save(model.state_dict(), 'imitation/pytorch/models/imitate.pt')
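# save_loss(...) is called above but defined elsewhere in the repository; judging
# from the call site it just persists the running loss curve to a .npy file. The
# one-line sketch below is an assumption about that helper, not the project's code.
def _save_loss_sketch(loss_list, path):
    np.save(path, np.array(loss_list))  # store the per-epoch losses as a NumPy array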
def _train(args): if not os.path.exists("./results"): os.makedirs("./results") if not os.path.exists(args.model_dir): os.makedirs(args.model_dir) # Launch the env with our helper function env = launch_env() print("Initialized environment") # Wrappers env = ResizeWrapper(env) env = GrayscaleWrapper(env) env = NormalizeWrapper(env) env = FrameStack(env, 4) env = DtRewardWrapper(env) env = ActionWrapper(env) print("Initialized Wrappers") # Set seeds seed(args.seed) state_dim = env.observation_space.shape action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Init training data total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 episode_reward = 0 env_counter = 0 reward = 0 episode_timesteps = 0 avg_episodes = 100 # Keep track of the best reward over time best_reward = -np.inf # Keep track of train_rewards train_rewards = [] # To print mean actions per episode mean_action = [] # To keep track of moving averages moving_avgs = [] # Summary writer for tensorboard writer = SummaryWriter(log_dir="reinforcement/pytorch/runs") # Initialize policy if args.policy not in policies: raise ValueError( "Policy {} is not available, chose one of : {}".format( args.policy, list(policies.keys()))) policy = policies[args.policy](state_dim, action_dim, max_action, args.per, args.gradclip) # Evaluate untrained policy evaluations = [evaluate_policy(env, policy)] moving_avgs.append(evaluations[0]) writer.add_scalar("Timesteps/EvaluationReward", evaluations[0], total_timesteps) ## Initialize ReplayBuffer if args.per: print("Training with Prioritized Experience Reply") replay_buffer = PrioritizedReplayBuffer( args.replay_buffer_max_size, args.batch_size, args.seed, initial_beta=0.5, delta_beta=2 / args.max_timesteps, ) else: replay_buffer = ReplayBuffer(args.replay_buffer_max_size, args.batch_size, args.seed) # Load previous policy if args.load_initial_policy: # Disable random start steps args.start_timesteps = 0 # Load training data checkpoint = load_training_state(args.model_dir, args.policy + "_training") evaluations = checkpoint["evaluations"] total_timesteps = checkpoint["total_timesteps"] train_rewards = checkpoint["train_rewards"] episode_num = checkpoint["episode_num"] best_reward = checkpoint["best_reward"] moving_avgs = checkpoint["moving_avgs"] # Load policy policy.load(args.model_dir, args.policy) print("Starting training") obs = env.reset() while total_timesteps < args.max_timesteps: # Select action if total_timesteps < args.start_timesteps: action = env.action_space.sample() else: action = policy.predict(np.array(obs)) action = add_noise(action, args.expl_noise, env.action_space.low, env.action_space.high) mean_action.append(action) # Perform action new_obs, reward, done, _ = env.step(action) # Update episode reward episode_reward += reward # Store data in replay buffer replay_buffer.add(obs, action, reward, new_obs, float(done)) # Update network if len(replay_buffer) >= args.batch_size: policy.update(replay_buffer, args.discount, args.tau) # Update env obs = new_obs episode_timesteps += 1 total_timesteps += 1 timesteps_since_eval += 1 if episode_timesteps >= args.env_timesteps: done = True if done: print(( "Total T: %d Episode Num: %d \nMean actions: %.2f %.2f Episode T: %d Reward: %.1f Moving Average: %.1f" ) % ( total_timesteps, episode_num, np.mean(np.array(mean_action), axis=0)[0], np.mean(np.array(mean_action), axis=0)[1], episode_timesteps, episode_reward, moving_avgs[-1], )) train_rewards.append(episode_reward) 
moving_avgs.append(moving_average(train_rewards, avg_episodes)) writer.add_scalar("Timesteps/Rewards", episode_reward, total_timesteps) writer.add_scalar("Timesteps/MovingAverage", moving_avgs[-1], total_timesteps) # writer.add_scalar("Episode/Wheel1Mean", np.mean(np.array(mean_action), axis=0)[0], episode_num) # writer.add_scalar("Episode/Wheel2Mean", np.mean(np.array(mean_action), axis=0)[1], episode_num) # Evaluate episode if timesteps_since_eval >= args.eval_freq: timesteps_since_eval %= args.eval_freq eval_reward = evaluate_policy(env, policy) evaluations.append(eval_reward) writer.add_scalar("Timesteps/EvaluationReward", eval_reward, total_timesteps) print( "\n-+-+-+-+-+-+-+-+-+-+ Evaluation reward at time {}: {} +-+-+-+-+-+-+-+-+-+-" .format(total_timesteps, eval_reward)) np.savetxt( "reinforcement/pytorch/results/eval_rewards_" + args.policy + ".csv", np.array(evaluations), delimiter=",", ) np.savetxt( "reinforcement/pytorch/results/train_rewards_" + args.policy + ".csv", np.array(train_rewards), delimiter=",", ) np.savetxt( "reinforcement/pytorch/results/moving_averages_" + args.policy + ".csv", np.array(moving_avgs), delimiter=",", ) # Save the policy according to the best reward over training if eval_reward > best_reward: best_reward = eval_reward policy.save(args.model_dir, args.policy) save_training_state( args.model_dir, args.policy + "_training", best_reward, total_timesteps, evaluations, train_rewards, episode_num, moving_avgs, ) print( "-+-+-+-+-+-+-+-+-+-+ Model saved +-+-+-+-+-+-+-+-+-+-\n" ) # Reset environment mean_action = [] obs = env.reset() env_counter += 1 episode_reward = 0 episode_timesteps = 0 episode_num += 1 print("Finished..should return now!")
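# moving_average(...) and add_noise(...) are used in the training loop above but
# defined elsewhere in the repository. The sketches below show plausible
# implementations inferred from their call sites (mean of the last N episode
# rewards; Gaussian exploration noise clipped to the action bounds); they are
# assumptions, not the project's exact helpers.
def _moving_average_sketch(rewards, window):
    # Average over the last `window` episode rewards (or all of them, early on)
    return float(np.mean(rewards[-window:]))

def _add_noise_sketch(action, expl_noise, low, high):
    # Add zero-mean Gaussian noise and keep the action inside the valid range
    noisy = action + np.random.normal(0, expl_noise, size=action.shape)
    return noisy.clip(low, high)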