def main(args):
    config_path = args.config_path
    if config_path is None:
        config_path = utils.select_run()
    if config_path is None:
        return
    print(config_path)
    cfg = utils.load_config(config_path)

    # Create env
    if args.real:
        real_robot_indices = list(map(int, args.real_robot_indices.split(',')))
        real_cube_indices = list(map(int, args.real_cube_indices.split(',')))
        env = utils.get_env_from_cfg(cfg, real=True, real_robot_indices=real_robot_indices, real_cube_indices=real_cube_indices)
    else:
        env = utils.get_env_from_cfg(cfg, show_gui=True)

    # Create policy
    policy = utils.get_policy_from_cfg(cfg, env.get_robot_group_types())

    # Run policy
    state = env.reset()
    try:
        while True:
            action = policy.step(state)
            state, _, done, _ = env.step(action)
            if done:
                state = env.reset()
    finally:
        env.close()
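# A minimal sketch of a matching command-line entry point; the exact flag
# names and defaults below are assumptions inferred from the attributes
# accessed in main() above.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config-path')  # Becomes args.config_path
    parser.add_argument('--real', action='store_true')
    parser.add_argument('--real-robot-indices', default='0')  # Comma-separated, e.g. '0,1'
    parser.add_argument('--real-cube-indices', default='0,1,2,3')
    main(parser.parse_args())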
def main(args):
    config_path = args.config_path
    if config_path is None:
        config_path = utils.select_run()
    if config_path is None:
        print('Please provide a config path')
        return
    cfg = utils.read_config(config_path)

    # Create env and policy
    env = utils.get_env_from_cfg(cfg, use_gui=True)
    policy = utils.get_policy_from_cfg(cfg, env.get_action_space())

    # Run policy
    state = env.reset()
    while True:
        action, _ = policy.step(state)
        state, _, done, _ = env.step(action)
        if done:
            state = env.reset()
def main(args):
    config_path = args.config_path
    if config_path is None:
        config_path = utils.select_run()
    if config_path is None:
        return
    print(config_path)
    cfg = utils.load_config(config_path)

    # Create env
    env = utils.get_env_from_cfg(cfg, show_gui=True)
    tf_env = components.get_tf_py_env(env, cfg.num_input_channels)

    # Load policies
    policies = components.load_policies(cfg)

    # Run policies
    time_step = tf_env.reset()
    while True:
        robot_group_index = tf_env.pyenv.envs[0].current_robot_group_index()
        action_step = policies[robot_group_index].action(time_step)
        time_step = tf_env.step(action_step.action)
def run_eval(cfg, num_episodes=20):
    random_seed = 0

    # Create env
    env = utils.get_env_from_cfg(cfg, random_seed=random_seed, use_egl_renderer=False)

    # Create policy
    policy = utils.get_policy_from_cfg(cfg, env.get_robot_group_types(), random_seed=random_seed)

    # Run policy
    data = [[] for _ in range(num_episodes)]
    episode_count = 0
    state = env.reset()
    while True:
        action = policy.step(state)
        state, _, done, info = env.step(action)
        data[episode_count].append({
            'simulation_steps': info['simulation_steps'],
            'cubes': info['total_cubes'],
            'robot_collisions': info['total_robot_collisions'],
        })
        if done:
            episode_count += 1
            print('Completed {}/{} episodes'.format(episode_count, num_episodes))
            if episode_count >= num_episodes:
                break
            state = env.reset()
    env.close()

    return data
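# A hypothetical helper (not part of the original script) showing one way to
# aggregate the per-episode records returned by run_eval(); the choice of
# summary statistic is an assumption.
def summarize_eval(data):
    import numpy as np
    # Each episode's last record holds the final cumulative counts
    final_cubes = [episode[-1]['cubes'] for episode in data]
    final_collisions = [episode[-1]['robot_collisions'] for episode in data]
    print('cubes: {:.2f} +/- {:.2f}'.format(np.mean(final_cubes), np.std(final_cubes)))
    print('robot collisions: {:.2f} +/- {:.2f}'.format(np.mean(final_collisions), np.std(final_collisions)))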
def _run_eval(cfg, num_episodes=20):
    # Create env and policy
    env = utils.get_env_from_cfg(cfg, random_seed=0)
    policy = utils.get_policy_from_cfg(cfg, env.get_action_space(), random_seed=0)

    # Run policy
    data = [[] for _ in range(num_episodes)]
    episode_count = 0
    state = env.reset()
    while True:
        action, _ = policy.step(state)
        state, _, done, info = env.step(action)
        data[episode_count].append({
            'distance': info['cumulative_distance'],
            'cubes': info['cumulative_cubes'],
        })
        if done:
            state = env.reset()
            episode_count += 1
            print('Completed {}/{} episodes'.format(episode_count, num_episodes))
            if episode_count >= num_episodes:
                break
    return data
def run_eval(cfg, num_episodes=20):
    random_seed = 0

    # Create env
    env = utils.get_env_from_cfg(cfg, random_seed=random_seed, use_egl_renderer=False)
    tf_env = components.get_tf_py_env(env, cfg.num_input_channels)

    # Load policies
    policies = components.load_policies(cfg)

    # Run policies
    data = [[] for _ in range(num_episodes)]
    episode_count = 0
    time_step = tf_env.reset()
    while True:
        robot_group_index = tf_env.pyenv.envs[0].current_robot_group_index()
        action_step = policies[robot_group_index].action(time_step)
        time_step = tf_env.step(action_step.action)
        info = tf_env.pyenv.envs[0].get_info()
        data[episode_count].append({
            'simulation_steps': info['simulation_steps'],
            'cubes': info['total_cubes'],
        })
        if tf_env.pyenv.envs[0].done():
            episode_count += 1
            print('Completed {}/{} episodes'.format(episode_count, num_episodes))
            if episode_count >= num_episodes:
                break
            time_step = tf_env.reset()
    env.close()

    return data
def main(cfg):
    # Set up logging and checkpointing
    log_dir = Path(cfg.log_dir)
    checkpoint_dir = Path(cfg.checkpoint_dir)
    print('log_dir: {}'.format(log_dir))
    print('checkpoint_dir: {}'.format(checkpoint_dir))

    # Create environment
    kwargs = {}
    if cfg.show_gui:
        import matplotlib  # pylint: disable=import-outside-toplevel
        matplotlib.use('agg')
    if cfg.use_predicted_intention:
        # Enable ground truth intention map during training only
        kwargs['use_intention_map'] = True
        kwargs['intention_map_encoding'] = 'ramp'
    env = utils.get_env_from_cfg(cfg, **kwargs)
    robot_group_types = env.get_robot_group_types()
    num_robot_groups = len(robot_group_types)

    # Policy
    policy = utils.get_policy_from_cfg(cfg, robot_group_types, train=True)

    # Optimizers
    optimizers = []
    for i in range(num_robot_groups):
        optimizers.append(optim.SGD(policy.policy_nets[i].parameters(), lr=cfg.learning_rate, momentum=0.9, weight_decay=cfg.weight_decay))
    if cfg.use_predicted_intention:
        optimizers_intention = []
        for i in range(num_robot_groups):
            optimizers_intention.append(optim.SGD(policy.intention_nets[i].parameters(), lr=cfg.learning_rate, momentum=0.9, weight_decay=cfg.weight_decay))

    # Replay buffers
    replay_buffers = []
    for _ in range(num_robot_groups):
        replay_buffers.append(ReplayBuffer(cfg.replay_buffer_size))

    # Resume if applicable
    start_timestep = 0
    episode = 0
    if cfg.checkpoint_path is not None:
        checkpoint = torch.load(cfg.checkpoint_path)
        start_timestep = checkpoint['timestep']
        episode = checkpoint['episode']
        for i in range(num_robot_groups):
            optimizers[i].load_state_dict(checkpoint['optimizers'][i])
            replay_buffers[i] = checkpoint['replay_buffers'][i]
        if cfg.use_predicted_intention:
            for i in range(num_robot_groups):
                optimizers_intention[i].load_state_dict(checkpoint['optimizers_intention'][i])
        print("=> loaded checkpoint '{}' (timestep {})".format(cfg.checkpoint_path, start_timestep))

    # Target nets
    target_nets = policy.build_policy_nets()
    for i in range(num_robot_groups):
        target_nets[i].load_state_dict(policy.policy_nets[i].state_dict())
        target_nets[i].eval()

    # Logging
    train_summary_writer = SummaryWriter(log_dir=str(log_dir / 'train'))
    visualization_summary_writer = SummaryWriter(log_dir=str(log_dir / 'visualization'))
    meters = Meters()

    state = env.reset()
    transition_tracker = TransitionTracker(state)
    learning_starts = np.round(cfg.learning_starts_frac * cfg.total_timesteps).astype(np.uint32)
    total_timesteps_with_warm_up = learning_starts + cfg.total_timesteps
    for timestep in tqdm(range(start_timestep, total_timesteps_with_warm_up),
                         initial=start_timestep, total=total_timesteps_with_warm_up, file=sys.stdout):

        # Select an action for each robot
        exploration_eps = 1 - (1 - cfg.final_exploration) * min(1, max(0, timestep - learning_starts) / (cfg.exploration_frac * cfg.total_timesteps))
        if cfg.use_predicted_intention:
            use_ground_truth_intention = max(0, timestep - learning_starts) / cfg.total_timesteps <= cfg.use_predicted_intention_frac
            action = policy.step(state, exploration_eps=exploration_eps, use_ground_truth_intention=use_ground_truth_intention)
        else:
            action = policy.step(state, exploration_eps=exploration_eps)
        transition_tracker.update_action(action)

        # Step the simulation
        state, reward, done, info = env.step(action)

        # Store in buffers
        transitions_per_buffer = transition_tracker.update_step_completed(reward, state, done)
        for i, transitions in enumerate(transitions_per_buffer):
            for transition in transitions:
                replay_buffers[i].push(*transition)

        # Reset if episode ended
        if done:
            state = env.reset()
            transition_tracker = TransitionTracker(state)
            episode += 1

        # Train networks
        if timestep >= learning_starts and (timestep + 1) % cfg.train_freq == 0:
            all_train_info = {}
            for i in range(num_robot_groups):
                batch = replay_buffers[i].sample(cfg.batch_size)
                train_info = train(cfg, policy.policy_nets[i], target_nets[i], optimizers[i], batch, policy.apply_transform, cfg.discount_factors[i])
                if cfg.use_predicted_intention:
                    train_info_intention = train_intention(policy.intention_nets[i], optimizers_intention[i], batch, policy.apply_transform)
                    train_info.update(train_info_intention)
                for name, val in train_info.items():
                    all_train_info['{}/robot_group_{:02}'.format(name, i + 1)] = val

        # Update target networks
        if (timestep + 1) % cfg.target_update_freq == 0:
            for i in range(num_robot_groups):
                target_nets[i].load_state_dict(policy.policy_nets[i].state_dict())

        ################################################################################
        # Logging

        # Meters
        if timestep >= learning_starts and (timestep + 1) % cfg.train_freq == 0:
            for name, val in all_train_info.items():
                meters.update(name, val)

        if done:
            for name in meters.get_names():
                train_summary_writer.add_scalar(name, meters.avg(name), timestep + 1)
            meters.reset()

            train_summary_writer.add_scalar('steps', info['steps'], timestep + 1)
            train_summary_writer.add_scalar('total_cubes', info['total_cubes'], timestep + 1)
            train_summary_writer.add_scalar('episodes', episode, timestep + 1)
            for i in range(num_robot_groups):
                for name in ['cumulative_cubes', 'cumulative_distance', 'cumulative_reward', 'cumulative_robot_collisions']:
                    train_summary_writer.add_scalar('{}/robot_group_{:02}'.format(name, i + 1), np.mean(info[name][i]), timestep + 1)

            # Visualize Q-network outputs
            if timestep >= learning_starts:
                random_state = [[random.choice(replay_buffers[i].buffer).state] for i in range(num_robot_groups)]
                _, info = policy.step(random_state, debug=True)
                for i in range(num_robot_groups):
                    visualization = utils.get_state_output_visualization(random_state[i][0], info['output'][i][0]).transpose((2, 0, 1))
                    visualization_summary_writer.add_image('output/robot_group_{:02}'.format(i + 1), visualization, timestep + 1)
                    if cfg.use_predicted_intention:
                        visualization_intention = utils.get_state_output_visualization(
                            random_state[i][0],
                            np.stack((random_state[i][0][:, :, -1], info['output_intention'][i][0]), axis=0)  # Ground truth and output
                        ).transpose((2, 0, 1))
                        visualization_summary_writer.add_image('output_intention/robot_group_{:02}'.format(i + 1), visualization_intention, timestep + 1)

        ################################################################################
        # Checkpointing

        if (timestep + 1) % cfg.checkpoint_freq == 0 or timestep + 1 == total_timesteps_with_warm_up:
            if not checkpoint_dir.exists():
                checkpoint_dir.mkdir(parents=True, exist_ok=True)

            # Save policy
            policy_filename = 'policy_{:08d}.pth.tar'.format(timestep + 1)
            policy_path = checkpoint_dir / policy_filename
            policy_checkpoint = {
                'timestep': timestep + 1,
                'state_dicts': [policy.policy_nets[i].state_dict() for i in range(num_robot_groups)],
            }
            if cfg.use_predicted_intention:
                policy_checkpoint['state_dicts_intention'] = [policy.intention_nets[i].state_dict() for i in range(num_robot_groups)]
            torch.save(policy_checkpoint, str(policy_path))

            # Save checkpoint
            checkpoint_filename = 'checkpoint_{:08d}.pth.tar'.format(timestep + 1)
            checkpoint_path = checkpoint_dir / checkpoint_filename
            checkpoint = {
                'timestep': timestep + 1,
                'episode': episode,
                'optimizers': [optimizers[i].state_dict() for i in range(num_robot_groups)],
                'replay_buffers': [replay_buffers[i] for i in range(num_robot_groups)],
            }
            if cfg.use_predicted_intention:
                checkpoint['optimizers_intention'] = [optimizers_intention[i].state_dict() for i in range(num_robot_groups)]
            torch.save(checkpoint, str(checkpoint_path))

            # Save updated config file
            cfg.policy_path = str(policy_path)
            cfg.checkpoint_path = str(checkpoint_path)
            utils.save_config(log_dir / 'config.yml', cfg)

            # Remove old checkpoint
            checkpoint_paths = list(checkpoint_dir.glob('checkpoint_*.pth.tar'))
            checkpoint_paths.remove(checkpoint_path)
            for old_checkpoint_path in checkpoint_paths:
                old_checkpoint_path.unlink()

    env.close()
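# The ReplayBuffer class used above is not shown; below is a minimal sketch
# consistent with how it is used in the training loop (push(*transition),
# sample(batch_size), and direct access to .buffer entries with a .state
# field). The four-field Transition layout is an assumption, modeled on the
# standard uniform-sampling DQN buffer.
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, *args):
        # Append until full, then overwrite the oldest transition
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniform random minibatch, returned as a Transition of tuples
        transitions = random.sample(self.buffer, batch_size)
        return Transition(*zip(*transitions))

    def __len__(self):
        return len(self.buffer)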
def main(args):
    # Connect to aruco server for pose estimates
    try:
        conn = Client(('localhost', 6000), authkey=b'secret password')
    except ConnectionRefusedError:
        print('Could not connect to aruco server for pose estimates')
        return

    # Create action executor for the physical robot
    action_executor = vector_action_executor.VectorActionExecutor(args.robot_index)

    # Create env
    config_path = args.config_path
    if config_path is None:
        config_path = utils.select_run()
    if config_path is None:
        print('Please provide a config path')
        return
    cfg = utils.read_config(config_path)
    kwargs = {'num_cubes': args.num_cubes}
    if args.debug:
        kwargs['use_gui'] = True
    cube_indices = list(range(args.num_cubes))
    env = utils.get_env_from_cfg(cfg, physical_env=True, robot_index=action_executor.robot_index, cube_indices=cube_indices, **kwargs)
    env.reset()

    # Create policy
    policy = utils.get_policy_from_cfg(cfg, env.get_action_space())

    # Debug visualization
    if args.debug:
        cv2.namedWindow('out', cv2.WINDOW_NORMAL)
        #cv2.resizeWindow('out', 960, 480)

    try:
        while True:
            # Get new pose estimates
            poses = None
            while conn.poll():  # Ensure up-to-date data
                poses = conn.recv()
            if poses is None:
                continue

            # Update poses in the simulation
            env.update_poses(poses)

            # Get new action
            state = env.get_state()
            if action_executor.is_action_completed() and args.debug:
                action, info = policy.step(state, debug=True)

                # Visualize
                assert not cfg.use_steering_commands
                output = info['output'].cpu().numpy()
                cv2.imshow('out', utils.get_state_and_output_visualization(state, output)[:, :, ::-1])
                cv2.waitKey(1)
            else:
                action, _ = policy.step(state)

            # Run selected action through simulation
            try_action_result = env.try_action(action)

            if action_executor.is_action_completed():
                # Update action executor
                action_executor.update_try_action_result(try_action_result)

            # Run action executor
            action_executor.step(poses)
    finally:
        action_executor.disconnect()
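# The aruco pose server this script connects to is not shown; below is a
# minimal sketch of a compatible sender, assuming poses are whatever
# picklable structure env.update_poses() expects. The get_poses() callable
# is hypothetical.
from multiprocessing.connection import Listener

def serve_poses(get_poses):
    listener = Listener(('localhost', 6000), authkey=b'secret password')
    conn = listener.accept()
    try:
        while True:
            # Send the latest pose estimates; the client drains the
            # connection with poll()/recv() to keep only the newest message
            conn.send(get_poses())
    finally:
        conn.close()
        listener.close()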
def main(cfg):
    # Set up logging and checkpointing
    log_dir = Path(cfg.log_dir)
    checkpoint_dir = Path(cfg.checkpoint_dir)
    print('log_dir: {}'.format(log_dir))
    print('checkpoint_dir: {}'.format(checkpoint_dir))

    # Create env
    env = utils.get_env_from_cfg(cfg)
    tf_env = components.get_tf_py_env(env, cfg.num_input_channels)

    # Agents
    epsilon = tf.Variable(1.0)
    agents = []
    for i, g in enumerate(cfg.robot_config):
        robot_type = next(iter(g))
        q_net = components.QNetwork(tf_env.observation_spec(), num_output_channels=VectorEnv.get_num_output_channels(robot_type))
        optimizer = keras.optimizers.SGD(learning_rate=cfg.learning_rate, momentum=0.9)  # cfg.weight_decay is currently ignored
        agent_cls = dqn_agent.DdqnAgent if cfg.use_double_dqn else dqn_agent.DqnAgent
        agent = agent_cls(
            time_step_spec=tf_env.time_step_spec(),
            action_spec=components.get_action_spec(robot_type),
            q_network=q_net,
            optimizer=optimizer,
            epsilon_greedy=epsilon,
            target_update_period=(cfg.target_update_freq // cfg.train_freq),
            td_errors_loss_fn=common.element_wise_huber_loss,
            gamma=cfg.discount_factors[i],
            gradient_clipping=cfg.grad_norm_clipping,
            train_step_counter=tf.Variable(0, dtype=tf.int64),  # Separate counter for each agent
        )
        agent.initialize()
        agent.train = common.function(agent.train)
        agents.append(agent)
    global_step = agents[0].train_step_counter

    # Replay buffers
    replay_buffers = [ReplayBuffer(cfg.replay_buffer_size) for _ in agents]

    # Checkpointing
    timestep_var = tf.Variable(0, dtype=tf.int64)
    agent_checkpointer = common.Checkpointer(ckpt_dir=str(checkpoint_dir / 'agents'), max_to_keep=5, agents=agents, timestep_var=timestep_var)
    agent_checkpointer.initialize_or_restore()
    if timestep_var.numpy() > 0:
        checkpoint_path = checkpoint_dir / 'checkpoint_{:08d}.pkl'.format(timestep_var.numpy())
        with open(checkpoint_path, 'rb') as f:
            replay_buffers = pickle.load(f)

    # Logging
    train_summary_writer = tf.summary.create_file_writer(str(log_dir / 'train'))
    train_summary_writer.set_as_default()

    time_step = tf_env.reset()
    learning_starts = round(cfg.learning_starts_frac * cfg.total_timesteps)
    total_timesteps_with_warm_up = learning_starts + cfg.total_timesteps
    start_timestep = timestep_var.numpy()
    for timestep in tqdm(range(start_timestep, total_timesteps_with_warm_up),
                         initial=start_timestep, total=total_timesteps_with_warm_up, file=sys.stdout):

        # Set exploration epsilon
        exploration_eps = 1 - (1 - cfg.final_exploration) * min(1, max(0, timestep - learning_starts) / (cfg.exploration_frac * cfg.total_timesteps))
        epsilon.assign(exploration_eps)

        # Run one collect step
        transitions_per_buffer = tf_env.pyenv.envs[0].store_time_step(time_step)
        robot_group_index = tf_env.pyenv.envs[0].current_robot_group_index()
        action_step = agents[robot_group_index].collect_policy.action(time_step)
        time_step = tf_env.step(action_step.action)

        # Store experience in buffers
        for i, transitions in enumerate(transitions_per_buffer):
            for transition in transitions:
                replay_buffers[i].push(*transition)

        # Train policies
        if timestep >= learning_starts and (timestep + 1) % cfg.train_freq == 0:
            for i, agent in enumerate(agents):
                experience = replay_buffers[i].sample(cfg.batch_size)
                agent.train(experience)

        # Logging
        if tf_env.pyenv.envs[0].done():
            info = tf_env.pyenv.envs[0].get_info()
            tf.summary.scalar('timesteps', timestep + 1, global_step)
            tf.summary.scalar('steps', info['steps'], global_step)
            tf.summary.scalar('total_cubes', info['total_cubes'], global_step)

        # Checkpointing
        if (timestep + 1) % cfg.checkpoint_freq == 0 or timestep + 1 == total_timesteps_with_warm_up:
            # Save agents
            timestep_var.assign(timestep + 1)
            agent_checkpointer.save(timestep + 1)

            # Save replay buffers
            checkpoint_path = checkpoint_dir / 'checkpoint_{:08d}.pkl'.format(timestep + 1)
            with open(checkpoint_path, 'wb') as f:
                pickle.dump(replay_buffers, f)
            cfg.checkpoint_path = str(checkpoint_path)
            utils.save_config(log_dir / 'config.yml', cfg)

            # Remove old checkpoints
            checkpoint_paths = list(checkpoint_dir.glob('checkpoint_*.pkl'))
            checkpoint_paths.remove(checkpoint_path)
            for old_checkpoint_path in checkpoint_paths:
                old_checkpoint_path.unlink()

    # Export trained policies
    policy_dir = checkpoint_dir / 'policies'
    for i, agent in enumerate(agents):
        policy_saver.PolicySaver(agent.policy).save(str(policy_dir / 'robot_group_{:02}'.format(i + 1)))
    cfg.policy_path = str(policy_dir)
    utils.save_config(log_dir / 'config.yml', cfg)

    env.close()
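# components.load_policies() (used by the eval and enjoy scripts above) is
# not shown; below is a minimal sketch of how the SavedModel policies
# exported by PolicySaver could be restored. The directory layout matches
# the export loop above; using tf.saved_model.load here is an assumption
# about the actual helper's implementation.
def load_policies(cfg):
    policy_dir = Path(cfg.policy_path)
    # Loaded policies expose .action(time_step), as used in the run loops above
    return [tf.saved_model.load(str(p))
            for p in sorted(policy_dir.glob('robot_group_*'))]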