def run(agent_spec, callback):
    AGENT_ID = "Agent-007"

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=["scenarios/intersections/2lane"],
        agent_specs={AGENT_ID: agent_spec},
        headless=True,
        timestep_sec=0.01,
        seed=42,
    )

    i = 0
    for episode in episodes(n=EPISODE_COUNT):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)
            if i % CAPTURE_STEP == 0:
                callback(rewards, agent_obs, dones, int(i / CAPTURE_STEP))
            i += 1

    env.close()
def main(scenarios, sim_name, headless, num_episodes, seed, max_episode_steps=None):
    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={},
        sim_name=sim_name,
        headless=headless,
        sumo_headless=True,
        visdom=False,
        seed=seed,
        timestep_sec=0.1,
    )

    if max_episode_steps is None:
        max_episode_steps = 1000

    for episode in episodes(n=num_episodes):
        env.reset()
        episode.record_scenario(env.scenario_log)

        for _ in range(max_episode_steps):
            env.step({})
            episode.record_step({}, {}, {}, {})

    env.close()
def main(scenarios, headless, num_episodes, max_episode_steps=None):
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(
            AgentType.LanerWithSpeed, max_episode_steps=max_episode_steps
        ),
        agent_builder=ChaseViaPointsAgent,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={"SingleAgent": agent_spec},
        headless=headless,
        sumo_headless=True,
    )

    # Convert `env.step()` and `env.reset()` from the multi-agent interface to
    # the single-agent interface.
    env = SingleAgent(env=env)

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observation = env.reset()
        episode.record_scenario(env.scenario_log)

        done = False
        while not done:
            agent_action = agent.act(observation)
            observation, reward, done, info = env.step(agent_action)
            episode.record_step(observation, reward, done, info)

    env.close()
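# For reference, a minimal sketch of the `ChaseViaPointsAgent` built above.
# This is an illustrative assumption, not necessarily the repository's exact
# implementation: with `AgentType.LanerWithSpeed` the action is a
# (target_speed, lane_change) pair, and each via point carries a
# `required_speed` and a `lane_index`.
from smarts.core.agent import Agent


class ChaseViaPointsAgent(Agent):
    def act(self, obs):
        # No via point nearby: fall back to the lane's speed limit.
        if len(obs.via_data.near_via_points) < 1:
            return (obs.waypoint_paths[0][0].speed_limit, 0)

        nearest = obs.via_data.near_via_points[0]
        if nearest.lane_index == obs.ego_vehicle_state.lane_index:
            # Already in the via point's lane; match its required speed.
            return (nearest.required_speed, 0)

        # Otherwise change one lane toward the via point.
        return (
            nearest.required_speed,
            1 if nearest.lane_index > obs.ego_vehicle_state.lane_index else -1,
        )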
def test_social_agents(env, agent_spec):
    for episode in episodes(n=MAX_EPISODES):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            obs = observations[AGENT_ID]
            observations, rewards, dones, infos = env.step({AGENT_ID: agent.act(obs)})
            episode.record_step(observations, rewards, dones, infos)

            assert SOCIAL_AGENT_ID not in observations
            assert SOCIAL_AGENT_ID not in dones

            # The reward is currently the delta in distance travelled by this
            # agent. We want to make sure that it is in fact a delta and not
            # the total distance travelled, since that bug has appeared a few
            # times. The way to verify this is to check that the reward does
            # not grow without bound.
            assert -3 < rewards[AGENT_ID] < 3

    assert episode.index == (
        MAX_EPISODES - 1
    ), "Simulation must cycle through to the final episode"
def train(args, agent_specs, eval_interval: int = None):
    scenario = Path(args.scenario).absolute()
    eval_scenario = Path(args.eval_scenario).absolute()

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=[scenario],
        agent_specs=agent_specs,
        headless=False,
        visdom=False,
        timestep_sec=0.1,
    )

    agents = {_id: agent_spec.build_agent() for _id, agent_spec in agent_specs.items()}

    for episode in episodes(n=50):
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_actions = {
                _id: agents[_id].act(obs) for _id, obs in observations.items()
            }
            observations, rewards, dones, infos = env.step(agent_actions)
            episode.record_step(observations, rewards, dones, infos)

        if eval_interval and episode.index % eval_interval == 0:
            # Block for evaluation.
            ray.wait([evaluate.remote(episode.index, eval_scenario, agent_specs)])
            # Alternatively, run the evaluation concurrently by omitting the
            # `ray.wait([...])`:
            #
            #   evaluate.remote(episode.index, eval_scenario, agent_specs)

    env.close()
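# A possible shape for the `evaluate` Ray task invoked above; this is a
# hedged sketch (the task's signature matches the call site, but its body is
# assumed): it builds fresh agents from the specs and rolls out one
# evaluation episode.
import gym
import ray


@ray.remote
def evaluate(episode_index, eval_scenario, agent_specs):
    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=[eval_scenario],
        agent_specs=agent_specs,
        headless=True,
    )
    agents = {_id: spec.build_agent() for _id, spec in agent_specs.items()}
    observations = env.reset()
    dones = {"__all__": False}
    total_reward = 0.0
    while not dones["__all__"]:
        actions = {_id: agents[_id].act(obs) for _id, obs in observations.items()}
        observations, rewards, dones, _ = env.step(actions)
        total_reward += sum(rewards.values())
    env.close()
    print(f"Evaluation after training episode {episode_index}: reward {total_reward}")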
def main(scenarios, sim_name, headless, num_episodes, seed, max_episode_steps=None):
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(
            AgentType.LanerWithSpeed, max_episode_steps=max_episode_steps
        ),
        agent_builder=ChaseViaPointsAgent,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        visdom=False,
        timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
        # zoo_addrs=[("10.193.241.236", 7432)],  # Sample server address (ip, port), to distribute social agents in a remote server.
        # envision_record_data_replay_path="./data_replay",
    )

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)

    env.close()
def main(
    scenarios,
    headless,
    num_episodes,
    seed,
):
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(
            AgentType.StandardWithAbsoluteSteering, max_episode_steps=3000
        ),
        policy_builder=HumanKeyboardPolicy,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)

    env.close()
def test_hiway_env(env, agent_spec):
    for episode in episodes(n=MAX_EPISODES):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            obs = observations[AGENT_ID]
            observations, rewards, dones, infos = env.step({AGENT_ID: agent.act(obs)})
            episode.record_step(observations, rewards, dones, infos)

            assert (
                OBSERVATION_EXPECTED in observations[AGENT_ID]
            ), "Failed to apply observation adapter"
            assert (
                REWARD_EXPECTED == rewards[AGENT_ID]
            ), "Failed to apply reward adapter"
            assert INFO_EXTRA_KEY in infos[AGENT_ID], "Failed to apply info adapter"

    assert episode.index == (
        MAX_EPISODES - 1
    ), "Simulation must cycle through to the final episode."
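# For context, the adapters this test exercises are plain functions attached
# to the `AgentSpec` fixture. A hedged sketch follows: the adapter bodies and
# the fixture wiring are illustrative assumptions (OBSERVATION_EXPECTED,
# REWARD_EXPECTED, and INFO_EXTRA_KEY are the test's own constants, and
# KeepLaneAgent stands in for any agent builder), while the adapter
# signatures follow the SMARTS convention.
from smarts.core.agent import AgentSpec
from smarts.core.agent_interface import AgentInterface, AgentType


def observation_adapter(env_obs):
    # Wrap the raw observation so the test can find OBSERVATION_EXPECTED.
    return {OBSERVATION_EXPECTED: env_obs}


def reward_adapter(env_obs, env_reward):
    # Replace the environment reward with the fixed expected value.
    return REWARD_EXPECTED


def info_adapter(env_obs, env_reward, env_info):
    # Attach an extra key for the test to detect.
    env_info[INFO_EXTRA_KEY] = "extra"
    return env_info


agent_spec = AgentSpec(
    interface=AgentInterface.from_type(AgentType.Laner, max_episode_steps=100),
    agent_builder=KeepLaneAgent,  # hypothetical builder for this sketch
    observation_adapter=observation_adapter,
    reward_adapter=reward_adapter,
    info_adapter=info_adapter,
)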
def run(agent_spec, callback, scenarios, episode_count, capture_step):
    AGENT_ID = "Agent-007"

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=[scenarios],
        agent_specs={AGENT_ID: agent_spec},
        headless=True,
        fixed_timestep_sec=0.01,
        seed=42,
    )

    i = 0
    for episode in episodes(n=episode_count):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)
            if i % capture_step == 0:
                callback(rewards, agent_obs, dones, int(i / capture_step))
            i += 1

    env.close()
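# Example invocation of `run` above with a simple logging callback. The
# scenario path, the agent class, and the AgentSpec construction here are
# assumptions for illustration only.
from smarts.core.agent import AgentSpec
from smarts.core.agent_interface import AgentInterface, AgentType

if __name__ == "__main__":

    def on_capture(rewards, agent_obs, dones, capture_index):
        # Called once every `capture_step` environment steps.
        print(f"capture {capture_index}: reward={rewards['Agent-007']}")

    run(
        agent_spec=AgentSpec(
            interface=AgentInterface.from_type(AgentType.Laner),
            agent_builder=KeepLaneAgent,  # hypothetical agent class
        ),
        callback=on_capture,
        scenarios="scenarios/loop",  # assumed scenario path
        episode_count=2,
        capture_step=100,
    )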
def main(scenarios, sim_name, headless, num_episodes, seed):
    open_agent_spec = open_agent.entrypoint(debug=False, aggressiveness=3)

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: open_agent_spec},
        sim_name=sim_name,
        headless=headless,
        visdom=False,
        timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
        # envision_record_data_replay_path="./data_replay",
    )

    for episode in episodes(n=num_episodes):
        agent = open_agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)

        del agent

    env.close()
def main(scenarios, headless, num_episodes, seed):
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(AgentType.Laner, max_episode_steps=None),
        agent_builder=KeepLaneAgent,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        headless=headless,
        visdom=False,
        timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
        # envision_record_data_replay_path="./data_replay",
    )

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)

    env.close()
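# The `KeepLaneAgent` built above is close to the simplest possible SMARTS
# agent: with `AgentType.Laner`, the action space accepts lane-following
# strings such as "keep_lane". A minimal sketch (assumed, but conventional):
from smarts.core.agent import Agent


class KeepLaneAgent(Agent):
    def act(self, obs):
        return "keep_lane"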
def train(
    training_scenarios, evaluation_scenarios, sim_name, headless, num_episodes, seed
):
    agent_params = {"input_dims": 4, "hidden_dims": 7, "output_dims": 3}
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(AgentType.Standard, max_episode_steps=5000),
        agent_params=agent_params,
        agent_builder=PyTorchAgent,
        observation_adapter=observation_adapter,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=training_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        fixed_timestep_sec=0.1,
        seed=seed,
    )

    steps = 0
    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)
            steps += 1

            if steps % 500 == 0:
                print("Evaluating agent")

                # We construct an evaluation agent based on the saved state of
                # the agent in training.
                model_path = tempfile.mktemp()
                agent.save(model_path)

                eval_agent_spec = agent_spec.replace(
                    agent_params=dict(agent_params, model_path=model_path)
                )

                # Remove the call to ray.wait if you want evaluation to run
                # in parallel with training.
                ray.wait(
                    [
                        evaluate.remote(
                            eval_agent_spec, evaluation_scenarios, headless, seed
                        )
                    ]
                )

    env.close()
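# The `observation_adapter` wired into the AgentSpec above has to reduce the
# raw SMARTS observation to the 4 values implied by input_dims=4. Which four
# features it picks is an assumption here; a plausible sketch:
import numpy as np


def observation_adapter(env_obs):
    ego = env_obs.ego_vehicle_state
    # Four scalar features: speed, heading, and 2D position.
    return np.array(
        [ego.speed, float(ego.heading), ego.position[0], ego.position[1]],
        dtype=np.float32,
    )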
def main(
    scenarios,
    sim_name,
    headless,
    num_episodes,
    seed,
    auth_key=None,
    max_episode_steps=None,
):
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(
            AgentType.Laner, max_episode_steps=max_episode_steps
        ),
        agent_builder=ChaseViaPointsAgent,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        visdom=False,
        timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
        # zoo_workers=[("143.110.210.157", 7432)],  # Distribute social agents across these workers.
        auth_key=auth_key,
        # envision_record_data_replay_path="./data_replay",
    )

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)

    env.close()
def main(scenarios, sim_name, headless, num_episodes, seed, max_episode_steps=None):
    agent_specs = {
        agent_id: AgentSpec(
            interface=AgentInterface.from_type(
                AgentType.Laner, max_episode_steps=max_episode_steps
            ),
            agent_builder=KeepLaneAgent,
        )
        for agent_id in AGENT_IDS
    }

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs=agent_specs,
        sim_name=sim_name,
        headless=headless,
        seed=seed,
    )

    for episode in episodes(n=num_episodes):
        agents = {
            agent_id: agent_spec.build_agent()
            for agent_id, agent_spec in agent_specs.items()
        }
        observations = env.reset()
        episode.record_scenario(env.scenario_log)

        dones = {"__all__": False}
        while not dones["__all__"]:
            actions = {
                agent_id: agents[agent_id].act(agent_obs)
                for agent_id, agent_obs in observations.items()
            }
            observations, rewards, dones, infos = env.step(actions)
            episode.record_step(observations, rewards, dones, infos)

    env.close()
def main(scenarios, sim_name, headless, num_episodes, seed, max_episode_steps=None):
    agent_spec = AgentSpec(
        interface=AgentInterface.from_type(
            AgentType.LanerWithSpeed, max_episode_steps=max_episode_steps
        ),
        agent_builder=ChaseViaPointsAgent,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        visdom=False,
        fixed_timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
        # zoo_addrs=[("10.193.241.236", 7432)],  # Sample server address (ip, port), to distribute social agents in a remote server.
        # envision_record_data_replay_path="./data_replay",
    )

    # Wrap the single-agent env with the SingleAgent wrapper to make `step`
    # and `reset` output compliant with gym spaces.
    env = SingleAgent(env)

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observation = env.reset()
        episode.record_scenario(env.scenario_log)

        done = False
        while not done:
            agent_action = agent.act(observation)
            observation, reward, done, info = env.step(agent_action)
            episode.record_step(observation, reward, done, info)

    env.close()
def main(scenarios, headless, num_episodes, max_episode_steps=None):
    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={},
        headless=headless,
        sumo_headless=True,
    )

    if max_episode_steps is None:
        max_episode_steps = 1000

    for episode in episodes(n=num_episodes):
        env.reset()
        episode.record_scenario(env.scenario_log)

        for _ in range(max_episode_steps):
            env.step({})
            episode.record_step({}, {}, {}, {})

    env.close()
def test_env_frame_test(scenarios, seed):
    env, agent_spec = env_and_spec(scenarios, seed)
    episode_counter = 0
    for episode in episodes(n=10):
        episode_counter += 1
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)
        dones = {"__all__": False}
        maximum_frame_rate = 0
        minimum_frame_rate = float("inf")
        step_counter = 0
        fps_sum = 0

        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)

            step_start_time = int(time.time() * 1000)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            step_end_time = int(time.time() * 1000)

            delta = step_end_time - step_start_time
            step_fps = round(1000 / delta, 2)
            maximum_frame_rate = max(maximum_frame_rate, step_fps)
            minimum_frame_rate = min(minimum_frame_rate, step_fps)
            fps_sum += step_fps

            test_logger.info(
                f"The time delta at episode {episode_counter}, step {step_counter+1} "
                f"is {delta} milliseconds which is {step_fps} fps."
            )

            episode.record_step(observations, rewards, dones, infos)
            step_counter += 1

        avg_frame_rate = fps_sum / (step_counter or 1)
        test_logger.info(
            f"Episode {episode_counter}, Minimum fps: {minimum_frame_rate}, "
            f"Maximum fps: {maximum_frame_rate}, Average fps: {avg_frame_rate}."
        )
        assert (minimum_frame_rate >= 10) and (avg_frame_rate >= 20)
    env.close()
def main(scenarios, sim_name, headless, num_episodes, seed, max_episode_steps=None):
    agent_spec = AgentSpec(
        interface=AgentInterface(
            waypoints=True,
            action=ActionSpaceType.LaneWithContinuousSpeed,
            neighborhood_vehicles=True,
            rgb=True,
        ),
        agent_builder=perpetual_rider_agent.PerpetualRiderAgent,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        visdom=True,
        timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
    )

    camera_pose = make_camera_pose()
    camera_intrinsic = np.array(
        (250.0, 0.0, FRAME_WIDTH / 2, 0.0, 250.0, FRAME_HEIGHT / 2)
    ).reshape((1, 2, 3))
    color_rng = RandomColor(10)

    scene_idx = 7001
    end_scene_idx = 8001
    rm(f"{OUTPUT_DIR}")
    mkdir(f"{OUTPUT_DIR}/annotations/")
    mkdir(f"{OUTPUT_DIR}/ego_poses/")

    for episode in episodes(n=num_episodes):
        agent = agent_spec.build_agent()
        observations = env.reset()
        episode.record_scenario(env.scenario_log)
        episode_sim_time_epoch = 0
        episode_sim_time_frame_with_visible_object = 0
        mkdir(f"{OUTPUT_DIR}/frames/scene-{scene_idx:04d}/")

        dones = {"__all__": False}
        while not dones["__all__"]:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            episode.record_step(observations, rewards, dones, infos)

            timestamp = episode.start_time + episode.sim_time

            # if episode.sim_time < 2.0:
            #     continue

            # Cap each episode at roughly 32 seconds of sim time in total.
            if episode.sim_time > 31.99:
                scene_idx += 1
                break

            # 10 seconds for each scene.
            if episode.sim_time - episode_sim_time_epoch > 9.99:
                scene_idx += 1
                episode_sim_time_epoch = episode.sim_time
                mkdir(f"{OUTPUT_DIR}/frames/scene-{scene_idx:04d}/")

            # Generate ego poses.
            ego_rot_quat = (
                Rotation.from_euler(
                    "z", agent_obs.ego_vehicle_state.heading, degrees=False
                )
                .as_quat()
                .flatten()
            )
            ego_translate = agent_obs.ego_vehicle_state.position.flatten()
            ego_pose = ", ".join(
                str(x) for x in np.concatenate((ego_rot_quat, ego_translate)).tolist()
            )
            with open(
                f"{OUTPUT_DIR}/ego_poses/scene-{scene_idx:04d}_ego_pose.csv", "a"
            ) as ego_pose_file:
                ego_pose_file.write(f"{timestamp}, {ego_pose}\n")

            # Generate the frame.
            frame_ego = np.zeros((FRAME_HEIGHT, FRAME_WIDTH, 3), dtype=np.uint8)
            ego_vehicle_pose = np.array(
                (
                    0,
                    0,
                    agent_obs.ego_vehicle_state.heading,
                    *agent_obs.ego_vehicle_state.position,
                )
            )
            color_rng.reset()
            visible_object_counter = 0
            for object_uid, neighborhood_vehicle_state in enumerate(
                agent_obs.neighborhood_vehicle_states
            ):
                other_vehicle_pose = np.array(
                    (
                        0,
                        0,
                        neighborhood_vehicle_state.heading,
                        *neighborhood_vehicle_state.position,
                    )
                )
                other_vehicle_size = neighborhood_vehicle_state.bounding_box
                color = color_rng()
                frame_ego, xyxy = project(
                    ego_vehicle_pose,
                    other_vehicle_pose,
                    camera_pose,
                    camera_intrinsic,
                    other_vehicle_size,
                    frame_ego,
                    color,
                )

                # Generate annotations.
                if xyxy is not None:
                    with open(
                        f"{OUTPUT_DIR}/annotations/scene-{scene_idx:04d}_instances_ann.csv",
                        "a",
                    ) as annotation_file:
                        annotation_file.write(
                            f"{timestamp}, {object_uid}, "
                            + ", ".join(str(x) for x in xyxy)
                            + "\n"
                        )
                    visible_object_counter += 1

            # Drop a scene with a large blank (no visible object for >0.5 s).
            if visible_object_counter < 1:
                if episode.sim_time - episode_sim_time_frame_with_visible_object > 0.5:
                    break
            else:
                episode_sim_time_frame_with_visible_object = episode.sim_time

        # Remove scenes shorter than ~10 seconds.
        if episode.sim_time - episode_sim_time_epoch < 9.99:
            rm(f"{OUTPUT_DIR}/frames/scene-{scene_idx:04d}/")
            rm(f"{OUTPUT_DIR}/annotations/scene-{scene_idx:04d}_instances_ann.csv")
            rm(f"{OUTPUT_DIR}/ego_poses/scene-{scene_idx:04d}_ego_pose.csv")

        time.sleep(2)
        if scene_idx >= end_scene_idx:
            break

    env.close()
def test(test_scenarios, sim_name, headless, num_episodes, seed):
    config = HyperParameters()
    configProto = init_tensorflow()

    # Init env.
    agent_spec = AgentSpec(
        # You can customize the AgentInterface to control what observation
        # information you need and the action type.
        interface=cross_interface,
        # agent_builder=actor,
        # You can customize your observation adapter, reward adapter, info
        # adapter, action adapter, and so on.
        observation_adapter=observation_adapter,
        reward_adapter=reward_adapter,
        action_adapter=action_adapter,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=test_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )

    # Init the network structure.
    if WITH_SOC_MT:
        model_name = "Soc_Mt_TD3Network"
        actor = SocMtActorNetwork(name="actor")
        critic_1 = SocMtCriticNetwork(name="critic_1")
        critic_2 = SocMtCriticNetwork(name="critic_2")
    else:
        model_name = "TD3Network"
        actor = ActorNetwork(name="actor")
        critic_1 = CriticNetwork(name="critic_1")
        critic_2 = CriticNetwork(name="critic_2")
    saver = tf.compat.v1.train.Saver()

    with tf.compat.v1.Session(config=configProto) as sess:
        # Load the network.
        saver = tf.compat.v1.train.import_meta_graph(
            "models/" + model_name + ".ckpt" + ".meta"
        )
        saver.restore(sess, "models/" + model_name + ".ckpt")
        if saver is None:
            print("did not load")

        # Init testing params.
        test_num = 100
        test_ep = 0
        # Results record.
        success = 0
        failure = 0
        passed_case = 0
        collision = 0
        trouble_collision = 0
        time_exceed = 0
        episode_time_record = []

        # Start testing.
        for episode in episodes(n=num_episodes):
            episode_reward = 0
            env_steps = 0  # Steps in one episode.
            observations = env.reset()  # States of all vehicles.
            state = observations[AGENT_ID]  # Ego state.
            episode.record_scenario(env.scenario_log)

            dones = {"__all__": False}
            while not dones["__all__"]:
                action = actor.get_action_noise(sess, state, rate=-1)
                observations, rewards, dones, infos = env.step(
                    {AGENT_ID: action}
                )  # States of all vehicles in the next step.

                # Ego state in the next step.
                state = observations[AGENT_ID]
                if WITH_SOC_MT:
                    reward = rewards[AGENT_ID]
                else:
                    reward = np.sum(list(rewards.values()))
                done = dones[AGENT_ID]
                info = infos[AGENT_ID]
                aux_info = get_aux_info(infos[AGENT_ID]["env_obs"])
                episode.record_step(observations, rewards, dones, infos)

                if WITH_SOC_MT:
                    episode_reward += np.sum(reward)
                else:
                    episode_reward += reward
                env_steps += 1

                if done:
                    test_ep += 1
                    # Record the result.
                    if aux_info == "collision":
                        collision += 1
                        failure += 1
                    elif aux_info == "trouble_collision":
                        trouble_collision += 1
                        passed_case += 1
                    elif aux_info == "time_exceed":
                        time_exceed += 1
                        failure += 1
                    else:
                        # Record the episode time.
                        episode_time_record.append(env_steps * 0.1)
                        success += 1
                    # Print.
                    print(
                        episode.index,
                        "EPISODE ended",
                        "TOTAL REWARD {:.4f}".format(episode_reward),
                        "Result:",
                        aux_info,
                    )
                    print("total step of this episode: ", env_steps)
                    episode_reward = 0
                    env_steps = 0
                    observations = env.reset()  # States of all vehicles.
                    state = observations[AGENT_ID]  # Ego state.

    env.close()
    print("-*" * 15, " result ", "-*" * 15)
    print("success: ", success, "/", test_num)
    print("collision: ", collision, "/", test_num)
    print("time_exceed: ", time_exceed, "/", test_num)
    print("passed_case: ", passed_case, "/", test_num)
    print("average time: ", np.mean(episode_time_record))
def train(
    training_scenarios,
    sim_name,
    headless,
    num_episodes,
    seed,
    without_soc_mt,
    session_dir,
):
    WITH_SOC_MT = without_soc_mt

    config = HyperParameters()
    configProto = init_tensorflow()

    # Init env.
    agent_spec = AgentSpec(
        # You can customize the AgentInterface to control what observation
        # information you need and the action type.
        interface=cross_interface,
        # agent_builder=actor,
        # You can customize your observation adapter, reward adapter, info
        # adapter, action adapter, and so on.
        observation_adapter=observation_adapter,
        reward_adapter=reward_adapter,
        action_adapter=action_adapter,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=training_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )

    # Init the network structure.
    if WITH_SOC_MT:
        model_name = "Soc_Mt_TD3Network"
        actor = SocMtActorNetwork(name="actor")
        critic_1 = SocMtCriticNetwork(name="critic_1")
        critic_2 = SocMtCriticNetwork(name="critic_2")
    else:
        model_name = "TD3Network"
        actor = ActorNetwork(name="actor")
        critic_1 = CriticNetwork(name="critic_1")
        critic_2 = CriticNetwork(name="critic_2")

    # TensorFlow summaries for TensorBoard visualization.
    writer = tf.compat.v1.summary.FileWriter("summary")
    # Losses.
    tf.compat.v1.summary.scalar("Loss", critic_1.loss)
    tf.compat.v1.summary.scalar("Hubor_loss", critic_1.loss_2)
    tf.compat.v1.summary.histogram("ISWeights", critic_1.ISWeights)
    write_op = tf.compat.v1.summary.merge_all()
    saver = tf.compat.v1.train.Saver(max_to_keep=1000)

    # Init the memory buffer.
    buffer = Buffer(config.buffer_size, config.pretrain_length)
    if config.load_buffer:
        # Note: the capacity of the buffer is limited by the buffer file.
        buffer = buffer.load_buffer(config.buffer_load_path)
        print("BUFFER: Buffer Loaded")
    else:
        buffer.fill_buffer(env, AGENT_ID)
        print("BUFFER: Buffer Filled")
        buffer.save_buffer(config.buffer_save_path, buffer)
    print("BUFFER: Buffer initialized")

    with tf.compat.v1.Session(config=configProto) as sess:
        # Init network params.
        sess.run(tf.compat.v1.global_variables_initializer())
        writer.add_graph(sess.graph)

        # Update the params of the target networks.
        actor.update_target(sess)
        critic_1.update_target(sess)
        critic_2.update_target(sess)

        # Reinforcement learning loop.
        print("Training Starts...")

        # Experiment results.
        recent_rewards = []  # Rewards from the most recent 100 episodes.
        avarage_rewards = []  # Average reward over the most recent 100 episodes.
        recent_success = []
        recent_success_rate = []
        EPSILON = 1

        for episode in episodes(n=num_episodes):
            env_steps = 0

            # Save the model from time to time.
            if config.model_save_frequency:
                if episode.index % config.model_save_frequency == 0:
                    save_path = saver.save(sess, f"{session_dir}/{model_name}.ckpt")
                    print("latest model saved")
                if episode.index % config.model_save_frequency_no_paste == 0:
                    saver.save(
                        sess,
                        f"{session_dir}/{model_name}_{str(episode.index)}.ckpt",
                    )
                    print("model saved")

            # Initialize.
            EPSILON = (config.noised_episodes - episode.index) / config.noised_episodes
            episode_reward = 0

            observations = env.reset()  # States of all vehicles.
            state = observations[AGENT_ID]  # Ego state.
            episode.record_scenario(env.scenario_log)

            dones = {"__all__": False}
            while not dones["__all__"]:
                action_noise = actor.get_action_noise(sess, state, rate=EPSILON)
                observations, rewards, dones, infos = env.step(
                    {AGENT_ID: action_noise}
                )  # States of all vehicles in the next step.

                # Ego state in the next step.
                next_state = observations[AGENT_ID]
                if WITH_SOC_MT:
                    reward = rewards[AGENT_ID]
                else:
                    reward = np.sum(list(rewards.values()))
                done = dones[AGENT_ID]
                info = infos[AGENT_ID]
                aux_info = get_aux_info(infos[AGENT_ID]["env_obs"])
                episode.record_step(observations, rewards, dones, infos)

                if WITH_SOC_MT:
                    episode_reward += np.sum(reward)
                else:
                    episode_reward += reward

                # Store the experience.
                experience = state, action_noise, reward, next_state, done
                buffer.store(experience)

                ## Model training STARTS.
                if env_steps % config.train_frequency == 0:
                    # "Delayed" policy updates: train the critics several
                    # times per actor update.
                    policy_delayed = 2
                    for _ in range(policy_delayed):
                        # First we need a mini-batch of experiences
                        # (s, a, r, s', done).
                        tree_idx, batch, ISWeights_mb = buffer.sample(config.batch_size)
                        s_mb, a_mb, r_mb, next_s_mb, dones_mb = get_split_batch(batch)
                        task_mb = s_mb[:, -config.task_size :]
                        next_task_mb = next_s_mb[:, -config.task_size :]

                        # Get q_target values for next_state from the target
                        # critics (with target policy smoothing).
                        if WITH_SOC_MT:
                            a_target_next_state = actor.get_action_target(
                                sess, next_s_mb
                            )
                            q_target_next_state_1 = critic_1.get_q_value_target(
                                sess, next_s_mb, a_target_next_state
                            )
                            # Multi-task q value.
                            q_target_next_state_1 = q_target_next_state_1 * next_task_mb
                            q_target_next_state_2 = critic_2.get_q_value_target(
                                sess, next_s_mb, a_target_next_state
                            )
                            # Multi-task q value.
                            q_target_next_state_2 = q_target_next_state_2 * next_task_mb
                            q_target_next_state = np.minimum(
                                q_target_next_state_1, q_target_next_state_2
                            )
                        else:
                            a_target_next_state = actor.get_action_target(
                                sess, next_s_mb
                            )
                            q_target_next_state_1 = critic_1.get_q_value_target(
                                sess, next_s_mb, a_target_next_state
                            )
                            q_target_next_state_2 = critic_2.get_q_value_target(
                                sess, next_s_mb, a_target_next_state
                            )
                            q_target_next_state = np.minimum(
                                q_target_next_state_1, q_target_next_state_2
                            )

                        # Set Q_target = r if the episode ends at s+1,
                        # otherwise Q_target = r + gamma * Q_target(s', a').
                        target_Qs_batch = []
                        for i in range(0, len(dones_mb)):
                            # If we are in a terminal state, the target only
                            # equals the reward.
                            terminal = dones_mb[i]
                            if terminal:
                                target_Qs_batch.append(r_mb[i] * task_mb[i])
                            else:
                                # Take the Q target for action a'.
                                target = (
                                    r_mb[i] * task_mb[i]
                                    + config.gamma * q_target_next_state[i]
                                )
                                target_Qs_batch.append(target)
                        targets_mb = np.array([each for each in target_Qs_batch])

                        # Critic train.
                        if len(a_mb.shape) > 2:
                            a_mb = np.squeeze(a_mb, axis=1)
                        loss, absolute_errors = critic_1.train(
                            sess, s_mb, a_mb, targets_mb, ISWeights_mb
                        )
                        loss_2, absolute_errors_2 = critic_2.train(
                            sess, s_mb, a_mb, targets_mb, ISWeights_mb
                        )

                    # Actor train.
                    a_for_grad = actor.get_action(sess, s_mb)
                    a_gradients = critic_1.get_gradients(sess, s_mb, a_for_grad)
                    actor.train(sess, s_mb, a_gradients[0])
                    # Target train.
                    actor.update_target(sess)
                    critic_1.update_target(sess)
                    critic_2.update_target(sess)

                    # Update replay memory priorities.
                    if WITH_SOC_MT:
                        absolute_errors = np.sum(absolute_errors, axis=1)
                    buffer.batch_update(tree_idx, absolute_errors)
                ## Model training ENDS.

                if done:
                    # Visualize reward data.
                    recent_rewards.append(episode_reward)
                    if len(recent_rewards) > 100:
                        recent_rewards.pop(0)
                    avarage_rewards.append(np.mean(recent_rewards))
                    avarage_rewards_data = np.array(avarage_rewards)
                    d = {"avarage_rewards": avarage_rewards_data}
                    with open(
                        os.path.join("results", "reward_data" + ".pkl"), "wb"
                    ) as f:
                        pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)

                    # Visualize success rate data.
                    if aux_info == "success":
                        recent_success.append(1)
                    else:
                        recent_success.append(0)
                    if len(recent_success) > 100:
                        recent_success.pop(0)
                    avarage_success_rate = recent_success.count(1) / len(recent_success)
                    recent_success_rate.append(avarage_success_rate)
                    recent_success_rate_data = np.array(recent_success_rate)
                    d = {"recent_success_rates": recent_success_rate_data}
                    with open(
                        os.path.join("results", "success_rate_data" + ".pkl"), "wb"
                    ) as f:
                        pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)

                    # Print results on the terminal.
                    print("Episode total reward:", episode_reward)
                    print("Episode time:", env_steps * 0.1)
                    print("Success rate:", avarage_success_rate)
                    print(episode.index, "episode finished.")
                    buffer.measure_utilization()
                    print("---" * 15)
                    break
                else:
                    state = next_state
                env_steps += 1

    env.close()
    def get_total_actions(self):
        """Returns the total number of actions an agent could ever take."""
        return self.n_actions

    def get_stats(self):
        return None

    def render(self):
        raise NotImplementedError

    def close(self):
        pass

    def seed(self):
        raise NotImplementedError


if __name__ == "__main__":
    env = SMARTSEnv()
    base_env = env.base_env

    for episode in episodes(n=100):
        observations = env.reset()
        episode.record_scenario(env.base_env.scenario_log)
        dones = {"__all__": False}
        # Note: `np.all` needs a list here; passing the `dict_values` view
        # directly does not evaluate the flags element-wise.
        while not np.all(list(dones.values())):
            observations, rewards, dones, infos = env.step([0, 1])
            episode.record_step(observations, rewards, dones, infos)
        type=str,
        nargs="+",
    )
    parser.add_argument(
        "--headless", help="run simulation in headless mode", action="store_true"
    )
    args = parser.parse_args()

    data_replay_path = (
        f"./{args.replay_data}/{args.scenarios[0].split('/')[-1]}/data_replay"
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=args.scenarios,
        agent_specs={},
        headless=args.headless,
        visdom=False,
        fixed_timestep_sec=0.1,
        endless_traffic=False,
        envision_record_data_replay_path=data_replay_path,
    )

    for episode in episodes(n=1):
        env.reset()
        episode.record_scenario(env.scenario_log)

        for _ in range(600):
            env.step({})
            episode.record_step({}, {}, {}, {})

    env.close()
def main(scenarios, sim_name, headless, seed, speed, max_steps, save_dir, write):
    from zoo import policies

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    policies.replay_save_dir = save_dir
    policies.replay_read = not write

    # This is how you can wrap an agent in the replay-agent-v0 wrapper to
    # store and load its inputs and actions, and replay it.
    agent_spec = zoo_make(
        "zoo.policies:replay-agent-v0",
        save_directory=save_dir,
        id="agent_007",
        wrapped_agent_locator="zoo.policies:keep-left-with-speed-agent-v0",
        wrapped_agent_params={"speed": speed},
    )

    # Copy the scenarios to the replay directory to make sure they are not
    # changed.
    copy_scenarios(save_dir, scenarios)

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        visdom=False,
        timestep_sec=0.1,
        sumo_headless=True,
        seed=seed,
    )

    # Carry out the experiment.
    episode = next(episodes(n=1))
    agent = agent_spec.build_agent()

    observations = env.reset()
    dones = {"__all__": False}
    MAX_STEPS = 2550
    i = 0
    try:
        while not dones["__all__"] and i < max_steps:
            agent_obs = observations[AGENT_ID]
            agent_action = agent.act(agent_obs)
            observations, rewards, dones, infos = env.step({AGENT_ID: agent_action})
            i += 1
            if i % 10 == 0:
                print("Step: ", i)
            episode.record_step(observations, rewards, dones, infos)
    except KeyboardInterrupt:
        # Discard the result.
        i = MAX_STEPS
    finally:
        if dones["__all__"]:
            i = MAX_STEPS
        try:
            episode.record_scenario(env.scenario_log)
            env.close()
        finally:
            sys.exit(i // 10)