def multiple_runs(on):
    """Record CarRacing frames and actions and save them as one ``.npz`` file.

    Runs ``MAX_RUNS`` episodes of ``MAX_GAME_TIME`` steps each. At every step
    the current frame is passed through ``_process_frame`` and stored together
    with the action returned by ``generate_action()`` that is then applied to
    the environment. Periodically (driven by ``REST_NUM``) ``env.car`` is
    replaced by a new ``Car`` built from a randomly chosen ``env.track`` entry.

    The collected data is written with ``np.savez_compressed`` to
    ``dst + '/rollout_v2_{on}.npz'`` under the keys ``action`` (float16) and
    ``state`` (uint8).

    Args:
        on: Value interpolated into the output file name.
    """
    env = CarRacing()
    states = []
    actions = []
    try:
        for run in range(MAX_RUNS):
            state = env.reset()
            counter = 0
            for game_time in range(MAX_GAME_TIME):
                action = generate_action()
                # Pair the pre-step frame with the action applied to it.
                state = _process_frame(state)
                states.append(state)
                actions.append(action)
                state, _reward, _done, _info = env.step(action)
                if counter == REST_NUM:
                    print('RUN:{},GT:{},DATA:{}'.format(run, game_time, len(states)))
                    # Swap in a new car at a random entry of the track.
                    position = np.random.randint(len(env.track))
                    env.car = Car(env.world, *env.track[position][1:4])
                    counter = 0
                counter += 1
    finally:
        # Release the environment even if a step raises; the original never
        # closed it at all.
        env.close()

    states = np.array(states, dtype=np.uint8)
    actions = np.array(actions, dtype=np.float16)
    save_name = 'rollout_v2_{}.npz'.format(on)
    np.savez_compressed(dst + '/' + save_name, action=actions, state=states)
def simulate_batch(batch_num):
    """Simulate ``_BATCH_SIZE`` episodes and save the normalized observations.

    Each episode resets the environment, moves the car to a random track
    position, then steps ``_TIME_STEPS`` times using ``generate_action``
    (which receives the previous action). Every post-step observation is
    passed through ``normalize_observation`` and collected; the list is saved
    with ``np.save`` to ``../data/obs_data_VAE_{batch_num}``.

    Args:
        batch_num: Value interpolated into the log message and file name.
    """
    env = CarRacing()
    obs_data = []
    try:
        action = env.action_space.sample()
        for _ in range(_BATCH_SIZE):
            observation = env.reset()
            # Little hack to make the Car start at random positions in the
            # race-track
            position = np.random.randint(len(env.track))
            env.car = Car(env.world, *env.track[position][1:4])
            # NOTE(review): this normalized reset frame is never stored; the
            # call is kept only so behaviour matches the original exactly.
            observation = normalize_observation(observation)

            for _ in range(_TIME_STEPS):
                if _RENDER:
                    env.render()
                action = generate_action(action)
                observation, _reward, _done, _info = env.step(action)
                observation = normalize_observation(observation)
                obs_data.append(observation)

        print("Saving dataset for batch {}".format(batch_num))
        np.save('../data/obs_data_VAE_{}'.format(batch_num), obs_data)
    finally:
        # Close the environment on both the success and the failure path.
        env.close()
def simulate_batch(batch_num):
    """Simulate ``batch_size`` episodes and save the normalized observations.

    Each episode resets the environment, replaces the car with one placed at
    a random track position, then steps ``steps`` times using
    ``create_action`` (which receives the previous action). Every post-step
    observation is passed through ``norm_obse`` and collected; the list is
    saved with ``np.save`` to ``data/TR_data_{batch_num}``.

    Args:
        batch_num: Value interpolated into the log message and file name.
    """
    car_env = CarRacing()
    obs_data = []
    try:
        action = car_env.action_space.sample()
        for _ in range(batch_size):
            en_observ = car_env.reset()
            # Make the car start at a random position on the track.
            position = np.random.randint(len(car_env.track))
            car_env.car = Car(car_env.world, *car_env.track[position][1:4])
            # NOTE(review): this normalized reset frame is never stored; the
            # call is kept only so behaviour matches the original exactly.
            en_observ = norm_obse(en_observ)

            # Time steps of one episode.
            for _ in range(steps):
                if render:
                    car_env.render()
                action = create_action(action)
                en_observ, _reward, _done, _info = car_env.step(action)
                en_observ = norm_obse(en_observ)
                obs_data.append(en_observ)

        print("Saving dataset for batch {}".format(batch_num))
        np.save('data/TR_data_{}'.format(batch_num), obs_data)
    finally:
        # Close the environment on both the success and the failure path.
        car_env.close()
def play(params):
    """Evaluate controller parameters over 16 CarRacing trials.

    Builds the VAE and MDN-RNN models (loading their checkpoints from
    ``checkpoints/`` when the files exist), wraps ``params`` in a
    ``controller.Controller`` and runs 16 trials of exactly 999 environment
    steps each. Each trial's summed reward is floored at -100 before being
    accumulated.

    Args:
        params: Parameters handed to ``controller.Controller``.

    Returns:
        The negated mean of the per-trial rewards.
    """
    num_trials = 16
    steps_per_trial = 999
    vae_path = "checkpoints/vae_checkpoint.pth"
    rnn_path = "checkpoints/rnn_checkpoint.pth"

    with torch.no_grad():
        block_print()
        device = torch.device("cpu")

        vae_model = vae.ConvVAE(VAE_Z_SIZE, VAE_KL_TOLERANCE)
        if os.path.exists(vae_path):
            vae_weights = torch.load(vae_path, map_location=device)
            vae_model.load_state_dict(vae_weights)
        vae_model = vae_model.eval()
        vae_model.to(device)

        rnn_model = rnn.MDMRNN(MDN_NUM_MIXTURES, MDN_HIDDEN_SIZE,
                               MDN_INPUT_SIZE, MDN_NUM_LAYERS,
                               MDN_BATCH_SIZE, 1, MDN_OUTPUT_SIZE)
        if os.path.exists(rnn_path):
            rnn_weights = torch.load(rnn_path, map_location=device)
            rnn_model.load_state_dict(rnn_weights)
        rnn_model.to(device)
        rnn_model = rnn_model.eval()

        controller_model = controller.Controller(CMA_EMBEDDING_SIZE,
                                                 CMA_NUM_ACTIONS, params)

        env = CarRacing()
        reward_sum = 0
        for _ in range(num_trials):
            observation = env.reset()

            # Little hack to make the Car start at random positions in the
            # race-track
            seed_digits = str(time.time() * 1000000)[10:13]
            np.random.seed(int(seed_digits))
            tile = np.random.randint(len(env.track))
            env.car = Car(env.world, *env.track[tile][1:4])

            hidden_state, cell_state = train_rnn.init_hidden(
                MDN_NUM_LAYERS, MDN_BATCH_SIZE, MDN_HIDDEN_SIZE, device)

            trial_reward = 0.0
            # NB: done is not True after 1000 steps when using the hack above
            # for random init of position, so the step count is fixed instead.
            for _ in range(steps_per_trial):
                action, hidden_state, cell_state = decide_action(
                    vae_model, rnn_model, controller_model, observation,
                    hidden_state, cell_state, device)
                observation, r, done, info = env.step(action)
                trial_reward += r

            # If reward is out of scale, clip it
            reward_sum += np.maximum(-100, trial_reward)

        env.close()
        return -(reward_sum / num_trials)
def main():
    """Generate nine files of 1000 cropped CarRacing observations each.

    For every file index 1..9 the environment is reset once; then, 1000
    times, a new ``Car`` is created from a random ``env.track`` entry whose
    first two components are perturbed by random integers in ``[-20, 20)``,
    and the frame returned by ``env.step(None)`` is cropped, normalized,
    resized to 64x64 and clipped to ``[0, 1]``. Each batch is saved to
    ``data/observations_{idx}.npy`` as float32.

    Every 100th frame is displayed with ``plt.show()``.
    """
    print("Generating data for env CarRacing-v0")
    env = CarRacing()
    try:
        for obs_idx in range(1, 10):
            env.reset()
            observations = []
            for i in range(1000):
                position = np.random.randint(len(env.track))
                angle = np.random.randint(-20, 20)
                x_off = np.random.randint(-20, 20)
                # Jitter the first two components of the chosen track entry
                # before spawning the car there.
                init_data = list(env.track[position][1:4])
                init_data[0] += angle
                init_data[1] += x_off
                env.car = Car(env.world, *init_data)

                observation = env.step(None)[0]
                cropped_obs = normalize_observation(
                    observation[:CROP_SIZE,
                                CROP_W_OFFSET:CROP_SIZE + CROP_W_OFFSET, :])
                cropped_obs = cv2.resize(
                    cropped_obs, dsize=(64, 64),
                    interpolation=cv2.INTER_CUBIC).astype(np.float32)
                # Clip in place after the cubic resize; presumably guards
                # against interpolation overshoot outside [0, 1].
                np.clip(cropped_obs, 0.0, 1.0, out=cropped_obs)

                if i % 10 == 0:
                    print(i)
                if i % 100 == 0:
                    plt.imshow(cropped_obs)
                    plt.show()
                observations.append(cropped_obs)

            observations = np.array(observations, dtype=np.float32)
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs("data", exist_ok=True)
            np.save("data/observations_%d.npy" % obs_idx, observations)
    finally:
        # The original never closed the environment.
        env.close()
def multiple_runs(on):
    """Record CarRacing (frame, action) pairs and save them as one ``.npy``.

    Runs ``MAX_RUNS`` episodes of ``MAX_GAME_TIME`` steps each. Every step
    applies the action from ``generate_action()`` and stores the resulting
    frame with that action as ``{'state': ..., 'action': ...}``. Whenever more
    than ``REST_NUM`` steps have passed since the last swap, ``env.car`` is
    replaced by a new ``Car`` built from a randomly chosen ``env.track`` entry.

    The list of dicts is written with ``np.save`` to
    ``dst + '/rollout_{on}.npy'``. Because the elements are dicts, NumPy
    stores them as a pickled object array.

    Args:
        on: Value interpolated into the output file name.
    """
    env = CarRacing()
    frame_and_action = []
    try:
        for run in range(MAX_RUNS):
            env.reset()
            counter = 0
            for game_time in range(MAX_GAME_TIME):
                action = generate_action()
                state, _reward, _done, _info = env.step(action)
                frame_and_action.append({'state': state, 'action': action})
                counter += 1
                if counter > REST_NUM:
                    print('RUN:{},GT:{},DATA:{}'.format(
                        run, game_time, len(frame_and_action)))
                    # Swap in a new car at a random entry of the track.
                    position = np.random.randint(len(env.track))
                    env.car = Car(env.world, *env.track[position][1:4])
                    counter = 0
    finally:
        # Release the environment even if a step raises; the original never
        # closed it at all.
        env.close()

    save_name = 'rollout_{}.npy'.format(on)
    np.save(dst + '/' + save_name, frame_and_action)
def simulate_batch(batch_num, save=True, time_steps=None, reduce_size=True):
    """Simulate ``_BATCH_SIZE`` episodes and return the normalized observations.

    Each episode resets the environment, moves the car to a random track
    position and records the normalized reset frame, then steps
    ``time_steps`` times using ``generate_action`` (which receives the
    previous action), recording each normalized post-step frame.

    Args:
        batch_num: Value interpolated into the log message and file name.
        save: When true, the observations are also written with ``np.save``
            to ``../data/obs_data_VAE_{batch_num:03d}``.
        time_steps: Steps per episode; ``None`` means ``_TIME_STEPS``.
        reduce_size: Forwarded to ``normalize_observation``.

    Returns:
        The list of normalized observations across all episodes.
    """
    if time_steps is None:
        time_steps = _TIME_STEPS

    env = CarRacing()
    obs_data = []
    try:
        action = env.action_space.sample()
        for _ in range(_BATCH_SIZE):
            observation = env.reset()
            # Little hack to make the Car start at random positions in the
            # race-track
            position = np.random.randint(len(env.track))
            env.car = Car(env.world, *env.track[position][1:4])
            observation = normalize_observation(
                observation, output_4d=False, reduce_size=reduce_size)
            obs_data.append(observation)

            for _ in range(time_steps):
                if _RENDER:
                    env.render()
                action = generate_action(action)
                observation, _reward, _done, _info = env.step(action)
                observation = normalize_observation(
                    observation, output_4d=False, reduce_size=reduce_size)
                obs_data.append(observation)

        if save:
            print("Saving dataset for batch {:03d}".format(batch_num))
            np.save('../data/obs_data_VAE_{:03d}'.format(batch_num), obs_data)
    finally:
        # Close the environment on both the success and the failure path.
        env.close()

    return obs_data
def simulate_batch(batch_num):
    """Roll out one controller-driven episode and save frames and actions.

    Builds the VAE and MDN-RNN models (loading checkpoints from
    ``checkpoints/`` when the files exist) and a controller whose parameters
    come from ``checkpoints/params.pkl`` if present, otherwise from
    ``controller.get_random_model_params``. The environment is then stepped
    ``SEQUENCE_LENGTH + 1`` times: each frame is encoded by the VAE, the
    controller picks an action from the embedding plus the RNN hidden state,
    and the RNN state is advanced with the embedding and action.

    Processed frames (uint8) and actions (float16) are written with
    ``np.savez_compressed`` to ``data/obs_data_VAE_{batch_num}`` under the
    keys ``obs`` and ``action``. The elapsed time is logged at INFO level.

    Args:
        batch_num: Value interpolated into the file name and log message.
    """
    og = time.time()
    block_print()
    with torch.no_grad():
        device = torch.device("cpu")

        vae_model = vae.ConvVAE(VAE_Z_SIZE, VAE_KL_TOLERANCE)
        if os.path.exists("checkpoints/vae_checkpoint.pth"):
            vae_model.load_state_dict(
                torch.load("checkpoints/vae_checkpoint.pth",
                           map_location=device))
        vae_model = vae_model.eval()
        vae_model.to(device)

        rnn_model = rnn.MDMRNN(MDN_NUM_MIXTURES, MDN_HIDDEN_SIZE,
                               MDN_INPUT_SIZE, MDN_NUM_LAYERS,
                               MDN_BATCH_SIZE, 1, MDN_OUTPUT_SIZE)
        if os.path.exists("checkpoints/rnn_checkpoint.pth"):
            rnn_model.load_state_dict(
                torch.load("checkpoints/rnn_checkpoint.pth",
                           map_location=device))
        rnn_model.to(device)
        rnn_model = rnn_model.eval()

        if os.path.exists("checkpoints/params.pkl"):
            # SECURITY: pickle.load can execute arbitrary code; only use
            # checkpoint files from a trusted source.
            with open('checkpoints/params.pkl', 'rb') as fo:
                params = pickle.load(fo)
            print("Loaded existing params")
        else:
            cma_num_params = (CMA_NUM_ACTIONS * CMA_EMBEDDING_SIZE
                              + CMA_NUM_ACTIONS)
            params = controller.get_random_model_params(
                cma_num_params, np.random.rand() * 0.01)
        controller_model = controller.Controller(CMA_EMBEDDING_SIZE,
                                                 CMA_NUM_ACTIONS, params)

        env = CarRacing()
        observations = []
        actions = []
        try:
            observation = env.reset()
            # Start the car at a random entry of the track.
            position = np.random.randint(len(env.track))
            env.car = Car(env.world, *env.track[position][1:4])
            hidden_state, cell_state = train_rnn.init_hidden(
                MDN_NUM_LAYERS, MDN_BATCH_SIZE, MDN_HIDDEN_SIZE, device)

            # NOTE(review): the first frame goes through process_frame twice
            # (here and at the top of the loop). Kept as in the original;
            # confirm process_frame is idempotent or drop this call.
            observation = process_frame(observation)

            for _ in range(SEQUENCE_LENGTH + 1):
                observation = process_frame(observation)
                observations.append(observation)

                # Turn the frame into a (-1, 3, 64, 64) tensor for the VAE.
                observation = normalize_observation(observation)
                observation = np.moveaxis(observation, 2, 0)
                observation = np.reshape(observation, (-1, 3, 64, 64))
                observation = torch.tensor(observation, device=device)

                mu, log_var = vae_model.encode(observation)
                embedding = vae_model.reparameterize(mu, log_var)

                controller_input = torch.cat(
                    (embedding, hidden_state.reshape(1, -1)), dim=1)
                action = controller_model.forward(controller_input)
                actions.append(action)

                observation, _reward, _done, _info = env.step(action)

                # Advance the RNN state with the embedding and chosen action;
                # only the recurrent state is used here.
                action_tensor = torch.from_numpy(action).float().to(device)
                action_tensor = action_tensor.view(1, -1)
                rnn_inputs = torch.cat((embedding, action_tensor), dim=1)
                _pi, _mean, _sigma, hidden_state, cell_state = \
                    rnn_model.forward(rnn_inputs, hidden_state, cell_state)
        finally:
            # Close the environment on both the success and failure path.
            env.close()

        observations = np.array(observations, dtype=np.uint8)
        actions = np.array(actions, dtype=np.float16)
        np.savez_compressed('data/obs_data_VAE_{}'.format(batch_num),
                            obs=observations, action=actions)

    end = time.time()
    logging.info("_" + str(batch_num) + " Total: " + str(end - og))
def play(params, render=True, verbose=False, save_visualization=False,
         max_len=999):
    """Evaluate controller parameters over up to 8 CarRacing trials.

    Loads the VAE via ``load_vae()``, then for each trial resets the
    environment, moves the car to a random track position and repeatedly
    embeds the normalized frame, picks an action with ``decide_action`` and
    steps the environment. A trial ends when ``max_len`` steps have been
    taken, or after more than 10 steps when ``_IS_TEST`` is set. Each trial's
    summed reward is floored at -100 before being accumulated.

    Args:
        params: Controller parameters forwarded to ``decide_action``.
        render: Render the environment before every step.
        verbose: Print the action and running reward every 200 steps and at
            step 999.
        save_visualization: After the first trial, pass the collected frames
            to ``network.show_pred`` and stop running further trials.
        max_len: Number of steps at which a trial is cut off.

    Returns:
        The negated accumulated reward divided by the fixed trial count (8),
        even when ``save_visualization`` stopped after a single trial.
    """
    started_at = datetime.datetime.now()
    print('Agent train run begun ' + str(started_at))

    sess, network = load_vae()
    env = CarRacing()

    # Ha and Schmidhuber use 16 trials; this implementation uses 8.
    num_trials = 8
    reward_sum = 0

    for _ in range(num_trials):
        frame = env.reset()
        frame = network.normalize_observation(frame)

        # Little hack to make the Car start at random positions in the
        # race-track
        seed_digits = str(time.time() * 1000000)[10:13]
        np.random.seed(int(seed_digits))
        tile = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[tile][1:4])

        trial_reward = 0.0
        steps = 0
        frames = [frame]

        while True:
            if render:
                env.render()

            # NOTE(review): on the first iteration the frame was already
            # normalized right after reset, so it is normalized twice.
            frame = network.normalize_observation(frame)
            frames.append(frame)

            embedding = network.get_embedding(sess, frame)
            action = decide_action(sess, embedding, params)
            frame, r, done, info = env.step(action)
            trial_reward += r

            # NB: done is not True after 1000 steps when using the hack above
            # for random init of position
            at_report_step = steps % 200 == 0 or steps == 999
            if verbose and at_report_step:
                formatted_action = ["{:+0.2f}".format(x) for x in action]
                print("\naction " + str(formatted_action))
                print("step {} total_reward {:+0.2f}".format(
                    steps, trial_reward))

            steps += 1
            if steps == max_len or (_IS_TEST and steps > 10):
                break

        reward_sum += np.maximum(-100, trial_reward)

        if save_visualization:
            title = 'train_agent_r{:.2f}'.format(reward_sum)
            print('Saving trajectory:', title)
            network.show_pred(title, np.concatenate(frames, 0))
            break

        print('.', end='')

    sess.close()
    env.close()
    # NOTE(review): this prints the start timestamp, not the finish time.
    print('Agent done - ' + str(started_at))
    return -(reward_sum / num_trials)