Example #1
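PPO with auxiliary supervised heads. A shared GNN encodes the traffic graph; encoder 0 is the ego-node embedding, and one 2-way softmax head per surrounding vehicle predicts that vehicle's label. Their concatenated outputs feed a linear decoder that produces the action distribution, trained with PPOSupTrainer. Names such as Mlp, SoftmaxPolicy, ArgmaxDiscretePolicy, MdpPathCollector, TorchOnlineRLAlgorithm, and ptu are assumed to be imported at module level from rlkit and the surrounding project, and args comes from the enclosing script's argument parser.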
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    from graph_builder_multi import MultiTrafficGraphBuilder
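    # Traffic graph: one ego node plus one node per surrounding vehicle
    # (expl_env.max_veh_num), each with 4 input features.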
    gb = MultiTrafficGraphBuilder(
        input_dim=4,
        node_num=expl_env.max_veh_num + 1,
        ego_init=torch.tensor([0., 1.]),
        other_init=torch.tensor([1., 0.]),
    )
    from gnn_net import GNNNet
    gnn = GNNNet(
        pre_graph_builder=gb,
        node_dim=16,
        num_conv_layers=3,
    )

    from layers import SelectLayer
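    # Encoder 0 is the ego-node embedding; the remaining encoders are per-vehicle
    # supervised heads, each a 2-way softmax over that vehicle's label.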
    encoders = []
    encoders.append(nn.Sequential(gnn, SelectLayer(1, 0), nn.ReLU()))
    sup_learners = []
    for i in range(expl_env.max_veh_num):
        sup_learner = nn.Sequential(
            gnn,
            SelectLayer(1, i + 1),
            nn.ReLU(),
            nn.Linear(16, 2),
        )
        sup_learner = SoftmaxPolicy(sup_learner, learn_temperature=False)
        sup_learners.append(sup_learner)
        encoders.append(sup_learner)

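    # The decoder maps the concatenated ego embedding (16) and per-vehicle label
    # outputs (2 each) to action logits.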
    decoder = Mlp(
        input_size=int(16 + 2 * expl_env.max_veh_num),
        output_size=action_dim,
        hidden_sizes=[],
    )
    from layers import ConcatLayer
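    # With 'no_gradient' set, the policy loss is not backpropagated through the
    # supervised heads (encoders 1..max_veh_num).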
    need_gradients = np.array([True] * len(encoders))
    if variant['no_gradient']:
        need_gradients[1:] = False
    policy = nn.Sequential(
        ConcatLayer(encoders, need_gradients=list(need_gradients), dim=1),
        decoder,
    )
    policy = SoftmaxPolicy(policy, learn_temperature=False)

    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
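    # The replay buffer stores one label per surrounding vehicle for the
    # supervised loss used by PPOSupTrainer.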
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dims=[1] * expl_env.max_veh_num,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.ppo_sup import PPOSupTrainer
    trainer = PPOSupTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        sup_learners=sup_learners,
        replay_buffer=replay_buffer,
        **variant['trainer_kwargs']
    )
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #2
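Builds a MultiTrafficGraphBuilder for a traffic environment, runs it on a single observation, and prints the valid-node mask, node features, and edge index before drawing the resulting graph with networkx.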
env_kwargs = dict(
    observe_mode=args.obs,
    label_mode=args.label,
    yld=args.yld,
    driver_sigma=args.ds,
)
from traffic.make_env import make_env
env = make_env(args.exp_name, **env_kwargs)
obs_dim = env.observation_space.low.size
action_dim = env.action_space.n
label_num = env.label_num
label_dim = env.label_dim

from graph_builder_multi import MultiTrafficGraphBuilder
gb = MultiTrafficGraphBuilder(
    input_dim=4,
    node_num=env.max_veh_num + 1,
    ego_init=torch.tensor([0., 1.]),
    other_init=torch.tensor([1., 0.]),
)

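# Build the graph for a single observation and inspect the intermediate tensors.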
obs = env.reset()
obs_batch = torch.tensor([obs])
valid_mask = gb.get_valid_node_mask(obs_batch)
print('valid_mask: ', valid_mask)

x, edge_index = gb(obs_batch)
print('x: ', x)
print('edge_index: ', edge_index)

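# Convert to a torch_geometric Data object, then to a networkx graph for a quick
# visual check of the graph structure.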
from torch_geometric.data import Data
import torch_geometric.utils as pyg_utils
import networkx

data = Data(x=x, edge_index=edge_index)
ng = pyg_utils.to_networkx(data)
networkx.draw_planar(ng)
Example #3
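TRPO with a separate supervised branch. The policy GNN consumes the 4 observation features plus a per-vehicle label, while a second GNN predicts those labels; SupSepSoftmaxPolicy combines the two and TRPOSupSepTrainer trains the combined policy.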
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

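    # Policy graph: node features are the 4 observation features plus a
    # label_dim-dimensional label for each vehicle.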
    from graph_builder_multi import MultiTrafficGraphBuilder
    policy_gb = MultiTrafficGraphBuilder(
        input_dim=4 + label_dim,
        node_num=expl_env.max_veh_num + 1,
        ego_init=torch.tensor([0., 1.]),
        other_init=torch.tensor([1., 0.]),
    )
    if variant['gnn_kwargs']['attention']:
        from gnn_attention_net import GNNAttentionNet
        gnn_class = GNNAttentionNet
    else:
        from gnn_net import GNNNet
        gnn_class = GNNNet
    policy_gnn = gnn_class(
        pre_graph_builder=policy_gb,
        node_dim=variant['gnn_kwargs']['node'],
        num_conv_layers=variant['gnn_kwargs']['layer'],
        hidden_activation=variant['gnn_kwargs']['activation'],
    )
    from layers import FlattenLayer, SelectLayer
    policy = nn.Sequential(
        policy_gnn, SelectLayer(1, 0), FlattenLayer(), nn.ReLU(),
        nn.Linear(variant['gnn_kwargs']['node'], action_dim))

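    # Separate supervised branch: a second GNN over the raw 4-dimensional node
    # features predicts a label distribution for each surrounding vehicle.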
    sup_gb = MultiTrafficGraphBuilder(
        input_dim=4,
        node_num=expl_env.max_veh_num + 1,
        ego_init=torch.tensor([0., 1.]),
        other_init=torch.tensor([1., 0.]),
    )
    sup_attentioner = None
    from layers import ReshapeLayer
    from gnn_net import GNNNet
    sup_gnn = GNNNet(
        pre_graph_builder=sup_gb,
        node_dim=variant['gnn_kwargs']['node'],
        num_conv_layers=variant['gnn_kwargs']['layer'],
        hidden_activation=variant['gnn_kwargs']['activation'],
    )
    sup_learner = nn.Sequential(
        sup_gnn,
        SelectLayer(1, np.arange(1, expl_env.max_veh_num + 1)),
        nn.ReLU(),
        nn.Linear(variant['gnn_kwargs']['node'], label_dim),
    )
    from sup_sep_softmax_policy import SupSepSoftmaxPolicy
    policy = SupSepSoftmaxPolicy(policy, sup_learner, label_num, label_dim)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
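    # Exploration uses a custom rollout (sup_sep_rollout), presumably so that the
    # per-vehicle labels are recorded alongside the transitions.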
    from sup_sep_rollout import sup_sep_rollout
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        rollout_fn=sup_sep_rollout,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=label_num,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.trpo_sup_sep import TRPOSupSepTrainer
    trainer = TRPOSupSepTrainer(policy=policy,
                                value_function=vf,
                                vf_criterion=vf_criterion,
                                replay_buffer=replay_buffer,
                                **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
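Plain PPO with a GNN policy (optionally GNNAttentionNet), with the option to resume the policy and value function from a previously saved params.pkl checkpoint.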
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

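    # Either resume the policy and value function from a saved rlkit checkpoint
    # (params.pkl) or build a fresh GNN policy below.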
    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        from graph_builder_multi import MultiTrafficGraphBuilder
        gb = MultiTrafficGraphBuilder(
            input_dim=4,
            node_num=expl_env.max_veh_num + 1,
            ego_init=torch.tensor([0., 1.]),
            other_init=torch.tensor([1., 0.]),
        )
        if variant['gnn_kwargs']['attention']:
            from gnn_attention_net import GNNAttentionNet
            gnn_class = GNNAttentionNet
        else:
            from gnn_net import GNNNet
            gnn_class = GNNNet
        gnn = gnn_class(
            pre_graph_builder=gb,
            node_dim=variant['gnn_kwargs']['node'],
            conv_type=variant['gnn_kwargs']['conv_type'],
            num_conv_layers=variant['gnn_kwargs']['layer'],
            hidden_activation=variant['gnn_kwargs']['activation'],
        )
        from layers import FlattenLayer, SelectLayer
        policy = nn.Sequential(
            gnn,
            SelectLayer(1, 0),
            FlattenLayer(),
            nn.ReLU(),
            nn.Linear(variant['gnn_kwargs']['node'], action_dim),
        )
        policy = SoftmaxPolicy(policy, learn_temperature=False)
        print('parameters: ', np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )
        
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
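    # Plain PPO here, with no auxiliary supervised heads (compare Examples 1 and 3).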
    trainer = PPOTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs']
    )
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()