def __init__(
    self,
    hidden_sizes,
    obs_dim,
    action_dim,
    init_w=3e-3,
    hidden_activation=F.relu,
    output_activation=identity,
    hidden_init=ptu.fanin_init,
    b_init_value=0.1,
    layer_norm=False,
    layer_norm_kwargs=None,
):
    super().__init__()
    self.fc1 = Mlp(
        input_size=obs_dim + action_dim,
        hidden_sizes=[],
        output_size=hidden_sizes[0],
        output_activation=hidden_activation,
        layer_norm=layer_norm,
    )
    self.fc2 = Mlp(
        input_size=action_dim + hidden_sizes[0],
        hidden_sizes=hidden_sizes[1:],
        output_size=1,
        output_activation=output_activation,
        layer_norm=layer_norm,
    )
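# A minimal forward-pass sketch for the two-stage Q-network above. The
# original forward method is not shown, so this is an assumption inferred
# from the layer shapes: the action is concatenated with the observation
# at fc1, then re-injected alongside fc1's features at fc2.
import torch

def forward(self, obs, action):
    # Stage 1: embed the (obs, action) pair into hidden_sizes[0] features.
    h = self.fc1(torch.cat([obs, action], dim=1))
    # Stage 2: re-inject the action, matching fc2's input_size of
    # action_dim + hidden_sizes[0], and output a scalar Q-value.
    return self.fc2(torch.cat([action, h], dim=1))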
def get_network(network_args, obs_dim, action_dim):
    if network_args["type"] == "conv_mixed":
        from surprise.envs.vizdoom.networks import VizdoomQF
        qf = VizdoomQF(actions=action_dim, **network_args)
        target_qf = VizdoomQF(actions=action_dim, **network_args)
    elif network_args["type"] == "conv":
        from surprise.envs.vizdoom.networks import VizdoomFeaturizer
        print("Using conv")
        qf = VizdoomFeaturizer(dim=action_dim, **network_args)
        target_qf = VizdoomFeaturizer(dim=action_dim, **network_args)
    else:
        from rlkit.torch.networks import Mlp
        qf = Mlp(
            hidden_sizes=[128, 64, 32],
            input_size=obs_dim[0],
            output_size=action_dim,
        )
        target_qf = Mlp(
            hidden_sizes=[128, 64, 32],
            input_size=obs_dim[0],
            output_size=action_dim,
        )
    return qf, target_qf
def __init__(
    self,
    input_dim,
    output_dim,
    latent_dims,
    encode_mlp_kwargs,
    decode_mlp_kwargs,
    no_gradient,
):
    super(MLPAutoEncoder, self).__init__()
    self.no_gradient = no_gradient
    # One encoder MLP per latent block.
    self.encode_mlps = nn.ModuleList()
    for latent_dim in latent_dims:
        mlp = Mlp(
            input_size=input_dim,
            output_size=latent_dim,
            **encode_mlp_kwargs,
        )
        self.encode_mlps.append(mlp)
    # The decoder consumes the concatenation of all latent blocks.
    self.decode_mlp = Mlp(
        input_size=np.sum(latent_dims),
        output_size=output_dim,
        **decode_mlp_kwargs,
    )
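# A hypothetical forward pass for the autoencoder above (not in the source):
# each encoder produces its latent block, the blocks are concatenated, and
# no_gradient detaches the latents so the decoder trains without pushing
# gradients into the encoders.
import torch

def forward(self, x):
    latents = [mlp(x) for mlp in self.encode_mlps]
    z = torch.cat(latents, dim=1)
    if self.no_gradient:
        z = z.detach()
    return self.decode_mlp(z)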
def __init__(self, env):
    self.env = env
    self.width = self.env.grid.width
    self.height = self.env.grid.height
    self.abstract_dim = 4
    self.state_dim = 2
    self.states = []
    self.state_to_idx = None
    self.encoder = Mlp(
        (64, 64, 64),
        output_size=self.abstract_dim,
        input_size=self.state_dim,
        output_activation=F.softmax,
        layer_norm=False,
    )
    # Enumerate all empty grid cells as states.
    states = []
    for j in range(self.env.grid.height):
        for i in range(self.env.grid.width):
            if self.env.grid.get(i, j) is None:
                states.append((i, j))
    self.states = states
    self.states_np = np.array(states)
    self.state_to_idx = {s: i for i, s in enumerate(self.states)}
    # Precompute the successor states of every state.
    self.next_states = []
    for i, state in enumerate(states):
        next_states = self._gen_transitions(state)
        self.next_states.append(next_states)
    self.next_states = np.array(self.next_states)
    self.encoder.cuda()
    self.optimizer = optim.Adam(self.encoder.parameters(), lr=1e-4)
def __init__(self, env):
    self.env = env
    self.width = self.env.grid.width
    self.height = self.env.grid.height
    self.abstract_dim = 4
    self.state_dim = 2
    self.states = []
    self.state_to_idx = None
    self.encoder = Mlp(
        (64, 64, 64),
        output_size=self.abstract_dim,
        input_size=self.state_dim,
        output_activation=F.softmax,
        layer_norm=True,
    )
    # Enumerate all empty grid cells as states.
    states = []
    for j in range(self.env.grid.height):
        for i in range(self.env.grid.width):
            if self.env.grid.get(i, j) is None:
                states.append((i, j))
    state_to_idx = {s: i for i, s in enumerate(states)}
    self.states = states
    self.state_to_idx = state_to_idx
    # Flatten (state, next_state) pairs into training transitions.
    transitions = []
    for i, state in enumerate(states):
        next_states = self._gen_transitions(state)
        for ns in next_states:
            transitions.append(list(state) + list(ns))
    self.transitions = transitions
    self.optimizer = optim.Adam(self.encoder.parameters())
def experiment(variant):
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(self, envs):
    self.envs = [EnvContainer(env) for env in envs]
    self.n_envs = len(self.envs)
    self.n_abstract_mdps = 2
    self.abstract_dim = 4
    self.state_dim = 4
    self.states = []
    self.state_to_idx = None
    # One list of encoders per environment, one encoder per abstract MDP.
    all_encoder_lst = nn.ModuleList()
    for i in range(self.n_envs):
        encoder_lst = nn.ModuleList()
        for j in range(self.n_abstract_mdps):
            encoder = Mlp(
                (128, 128, 128),
                output_size=self.abstract_dim,
                input_size=self.state_dim,
                output_activation=F.softmax,
                layer_norm=True,
            )
            encoder.apply(init_weights)
            encoder_lst.append(encoder)
        all_encoder_lst.append(encoder_lst)
    self.all_encoder_lst = all_encoder_lst
    self.optimizer = optim.Adam(self.all_encoder_lst.parameters(), lr=1e-4)
def __init__(self, trunk_params, split_heads_params):
    self.save_init_params(locals())
    super().__init__()
    # A shared trunk feeds two identical heads: one for the mean and one
    # for the log standard deviation.
    trunk_params['output_activation'] = F.relu
    self.trunk = Mlp(**trunk_params)
    self.mean_mlp = Mlp(**split_heads_params)
    self.log_sig_mlp = Mlp(**split_heads_params)
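# A plausible forward pass for the two-headed module above (assumed; the
# source omits it): run the shared trunk, then the separate heads.
def forward(self, x):
    h = self.trunk(x)
    return self.mean_mlp(h), self.log_sig_mlp(h)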
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    module = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def initialize_dynamics_model(self):
    obs_dim = self._obs[self.observation_key].shape[1]
    # The dynamics model predicts the next observation from the current
    # observation concatenated with the action.
    self.dynamics_model = Mlp(
        hidden_sizes=[128, 128],
        output_size=obs_dim,
        input_size=obs_dim + self._action_dim,
    )
    self.dynamics_model.to(ptu.device)
    self.dynamics_optimizer = Adam(self.dynamics_model.parameters())
    self.dynamics_loss = MSELoss()
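# A minimal training-step sketch for the dynamics model initialized above.
# The method name and the (obs, actions, next_obs) tensors are hypothetical;
# they are assumed to already live on ptu.device.
import torch

def dynamics_training_step(self, obs, actions, next_obs):
    pred_next_obs = self.dynamics_model(torch.cat([obs, actions], dim=1))
    loss = self.dynamics_loss(pred_next_obs, next_obs)
    self.dynamics_optimizer.zero_grad()
    loss.backward()
    self.dynamics_optimizer.step()
    return loss.item()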
def __init__(self, env, network, device=0, obs_key=None, hist_size=5000,
             reward_func=None, **kwargs):
    """
    params
    ======
    env (gym.Env) : environment to wrap
    """
    # from surprise.envs.vizdoom.networks import VAEConv
    # from surprise.envs.vizdoom.buffer import VAEBuffer
    from surprise.envs.vizdoom.buffer import SimpleBuffer
    from surprise.envs.vizdoom.networks import VizdoomFeaturizer
    from rlkit.torch.networks import Mlp
    from torch import optim

    self.device = device
    self.env = env
    self._obs_key = obs_key
    self._reward_func = reward_func

    # Gym spaces
    self.action_space = env.action_space
    self.observation_space = env.observation_space

    # RND: a frozen, randomly initialized target network and a trained
    # predictor network.
    self._buffer = SimpleBuffer(device=self.device, size=hist_size)
    if kwargs["network_type"] == "flat":
        self.target_net = Mlp(
            hidden_sizes=[128, 64],
            input_size=self.observation_space.low.size,
            output_size=64,
        ).to(self.device)
        self.target_net.eval()
        self.pred_net = Mlp(
            hidden_sizes=[128, 64, 32],
            input_size=self.observation_space.low.size,
            output_size=64,
        ).to(self.device)
    else:
        self.target_net = VizdoomFeaturizer(kwargs["encoding_size"]).to(self.device)
        self.target_net.eval()
        self.pred_net = VizdoomFeaturizer(kwargs["encoding_size"]).to(self.device)
    self.optimizer = optim.Adam(self.pred_net.parameters(), lr=1e-4)
    self.network = self.pred_net
    self.step_freq = 16
    self.loss = torch.zeros(1)
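# A sketch of the RND intrinsic bonus implied by the networks above; the
# wrapper's actual reward method is not shown, so the name _rnd_bonus and
# its signature are assumptions. The predictor chases the frozen random
# target, and its error measures observation novelty.
import torch

def _rnd_bonus(self, obs_tensor):
    with torch.no_grad():
        target_feat = self.target_net(obs_tensor)
    pred_feat = self.pred_net(obs_tensor)
    # Per-sample squared prediction error as the exploration bonus.
    return ((pred_feat - target_feat) ** 2).mean(dim=1)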
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True, few_obj=True,
                              success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True, few_obj=True,
                              success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): """Run the experiment.""" eval_env = gym.make('CartPole-v0') obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.n # Collect data. print('Collecting data...') data = [] while len(data) < variant['offline_data_size']: done = False s = eval_env.reset() while not done: a = np.random.randint(action_dim) n, r, done, _ = eval_env.step(a) one_hot_a = np.zeros(action_dim) one_hot_a[a] = 1 data.append((s, one_hot_a, r, n, done)) s = n if len(data) == variant['offline_data_size']: break qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim, ) target_qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim, ) qf_criterion = nn.MSELoss() eval_policy = ArgmaxDiscretePolicy(qf) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) trainer = DQNTrainer( qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'] ) offline_data = OfflineDataStore(data=data,) algorithm = TorchOfflineRLAlgorithm( trainer=trainer, evaluation_env=eval_env, evaluation_data_collector=eval_path_collector, offline_data=offline_data, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))
    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n
    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(self, envs):
    self.envs = [EnvContainer(env) for env in envs]
    self.n_abstract_mdps = 2
    self.abstract_dim = 4
    self.state_dim = 4
    self.states = []
    self.state_to_idx = None
    self.encoder = Mlp(
        (64, 64, 64),
        output_size=self.abstract_dim,
        input_size=self.state_dim,
        output_activation=F.softmax,
        layer_norm=True,
    )
    # Learnable transition logits between abstract states.
    self.transitions = nn.Parameter(
        torch.zeros((self.abstract_dim, self.abstract_dim))
    )
    self.optimizer = optim.Adam(self.encoder.parameters())
def __init__(
    self,
    # params for the MLP that encodes each timestep
    timestep_enc_params,
    # params for the MLP that encodes the whole trajectory
    traj_enc_params,
):
    self.save_init_params(locals())
    super().__init__()
    timestep_enc_params['output_activation'] = F.relu
    self.timestep_mlp = Mlp(**timestep_enc_params)
    # The relu below is commented out because it seriously hurts performance.
    # traj_enc_params['output_activation'] = F.relu
    self.traj_enc_mlp = Mlp(**traj_enc_params)
    self.output_size = self.traj_enc_mlp.output_size
def experiment(variant):
    env_sampler = MazeSampler(variant['env_specs'])
    env, _ = env_sampler()
    if variant['conv_input']:
        qf = ConvNet(
            kernel_sizes=variant['kernel_sizes'],
            num_channels=variant['num_channels'],
            strides=variant['strides'],
            paddings=variant['paddings'],
            hidden_sizes=variant['hidden_sizes'],
            input_size=env.observation_space.shape,
            output_size=env.action_space.n,
        )
    else:
        qf = Mlp(
            hidden_sizes=[
                variant['net_size'] for _ in range(variant['num_layers'])
            ],
            input_size=int(np.prod(env.observation_space.shape)),
            output_size=env.action_space.n,
        )
    qf_criterion = nn.MSELoss()
    # Use this to switch to DoubleDQN:
    # algorithm = DoubleDQN(
    algorithm = MetaDQN(
        env_sampler=env_sampler,
        qf=qf,
        qf_criterion=qf_criterion,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def __init__(
    self,
    pre_graph_builder,
    node_dim,
    output_dim,
    post_mlp_kwargs,
    num_conv_layers=3,
):
    super(GNNNet, self).__init__()
    # graph builder
    self.pre_graph_builder = pre_graph_builder
    # convs
    self.node_input_dim = pre_graph_builder.output_dim
    self.node_dim = node_dim
    self.num_conv_layers = num_conv_layers
    self.convs = self.build_convs(
        self.node_input_dim, self.node_dim, self.num_conv_layers
    )
    # post qf
    self.output_dim = output_dim
    self.post_mlp_kwargs = post_mlp_kwargs
    self.post_mlp = Mlp(
        input_size=self.node_dim,
        output_size=self.output_dim,
        **self.post_mlp_kwargs
    )
def __init__(
    self,
    representation_size,
    input_size,
    hidden_sizes,
    init_w=1e-3,
    hidden_init=ptu.fanin_init,
    output_activation=identity,
    output_scale=1,
    layer_norm=False,
):
    super().__init__()
    self.representation_size = representation_size
    self.hidden_init = hidden_init
    self.output_activation = output_activation
    self.dist_mu = np.zeros(self.representation_size)
    self.dist_std = np.ones(self.representation_size)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.init_w = init_w
    hidden_sizes = list(hidden_sizes)
    # The encoder has two heads of size representation_size each (latent
    # mean and latent std).
    self.encoder = TwoHeadMlp(
        hidden_sizes,
        representation_size,
        representation_size,
        input_size,
        layer_norm=layer_norm,
    )
    # Mirror the hidden sizes for the decoder.
    hidden_sizes.reverse()
    self.decoder = Mlp(
        hidden_sizes,
        input_size,
        representation_size,
        layer_norm=layer_norm,
        output_activation=output_activation,
        output_bias=None,
    )
    self.output_scale = output_scale
def get_non_linear_results(
    ob_space,
    encoder,
    latent_dim,
    batch_size=128,
    num_batches=10000,
) -> NonLinearResults:
    state_dim = ob_space.low.size
    decoder = Mlp(
        hidden_sizes=[64, 64],
        output_size=state_dim,
        input_size=latent_dim,
    )
    decoder.to(ptu.device)
    optimizer = optim.Adam(decoder.parameters())
    initial_loss = last_10_percent_loss = 0
    # Train a small decoder to reconstruct states from the encoder's
    # latents; the reconstruction loss measures how much state information
    # the latents retain.
    for i in range(num_batches):
        states = get_batch(ob_space, batch_size)
        x = ptu.from_numpy(states)
        z = encoder(x)
        x_hat = decoder(z)
        loss = ((x - x_hat) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i == 0:
            initial_loss = ptu.get_numpy(loss)
        if i == int(num_batches * 0.9):
            last_10_percent_loss = ptu.get_numpy(loss)
    # Evaluate on a large held-out batch.
    eval_states = get_batch(ob_space, batch_size=2 ** 15)
    x = ptu.from_numpy(eval_states)
    z = encoder(x)
    x_hat = decoder(z)
    reconstruction = ptu.get_numpy(x_hat)
    loss = ((eval_states - reconstruction) ** 2).mean()
    last_10_percent_contribution = (
        (last_10_percent_loss - loss) / (initial_loss - loss)
    )
    del decoder, optimizer
    return NonLinearResults(
        loss=loss,
        initial_loss=initial_loss,
        last_10_percent_contribution=last_10_percent_contribution,
    )
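# Example usage (hedged: `env`, `encoder`, and the latent dimension of 8 are
# hypothetical). The returned NonLinearResults reports how well a small MLP
# can recover states from the frozen encoder's latents.
results = get_non_linear_results(env.observation_space, encoder, latent_dim=8)
print(results.loss, results.last_10_percent_contribution)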
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwargs'])
    eval_env = SimpleSupEnv(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    hidden_dim = variant['hidden_dim']
    encoder = nn.Sequential(
        nn.Linear(obs_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim),
        nn.ReLU(),
    )
    decoder = nn.Linear(hidden_dim, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(hidden_dim, action_dim),
        ReshapeLayer(shape=(1, action_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    from rlkit.torch.vpg.ppo_sup_online import PPOSupOnlineTrainer
    trainer = PPOSupOnlineTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs']
    )
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwargs'])
    eval_env = SimpleSupEnv(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    encoder = nn.Sequential(
        nn.Linear(obs_dim, 16),
        nn.ReLU(),
    )
    decoder = nn.Linear(16, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(16, action_dim),
        ReshapeLayer(shape=(1, action_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=1,
        max_replay_buffer_size=int(1e6),
    )
    from rlkit.torch.vpg.trpo_sup import TRPOSupTrainer
    trainer = TRPOSupTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        replay_buffer=replay_buffer,
        **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    encoder = nn.Sequential(
        nn.Linear(obs_dim, 32),
        nn.ReLU(),
        nn.Linear(32, 32),
        nn.ReLU(),
    )
    decoder = nn.Linear(32, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(32, int(label_num * label_dim)),
        ReshapeLayer(shape=(label_num, label_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = TRPOTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = DiscreteSwimmerEnv(**variant['env_params'])
    qf = Mlp(
        input_size=int(np.prod(env.observation_space.shape)),
        output_size=env.action_space.n,
        **variant['qf_kwargs'])
    algorithm = DQN(env, qf=qf, **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
def gen_network_num_obj(variant, action_dim, layer_size, policy=False):
    return FoodNetworkMediumPartialObsTaskNumObj(
        img_network=Mlp(**variant['full_img_network_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        num_obj_network=Mlp(**variant['num_obj_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['full_img_network_kwargs']['output_size']
            + variant['inventory_network_kwargs']['output_size']
            + variant['num_obj_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity),
        sizes=[
            variant['full_img_network_kwargs']['input_size'],
            # shelf dim
            64,
            # num made objs
            8,
        ])
def __init__(self, enc_hidden_sizes, z_dim, classifier_hidden_sizes):
    super(Classifier, self).__init__()
    self.enc = Mlp(
        enc_hidden_sizes,
        z_dim,
        6,
        hidden_activation=torch.nn.functional.relu,
        # batch_norm=True,
        # layer_norm=True,
    )
    self.classifier = Mlp(
        classifier_hidden_sizes,
        1,
        z_dim + 6,
        hidden_activation=torch.nn.functional.relu,
        # batch_norm=True,
        # layer_norm=True,
    )
    self.z_dim = z_dim
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        hidden_dim = variant['mlp_kwargs']['hidden']
        policy = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        )
        policy = SoftmaxPolicy(policy)
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = DiscretizeEnv(env, variant['num_bins'])
    # env = DiscreteReacherEnv(**variant['env_kwargs'])
    qf = Mlp(
        input_size=int(np.prod(env.observation_space.shape)),
        output_size=env.action_space.n,
        **variant['qf_kwargs'])
    algorithm = FiniteHorizonDQN(env, qf, **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(
    self,
    K,
    representation_size,
    action_size,
):
    super().__init__()
    self.K = K
    self.rep_size = representation_size
    self.action_size = action_size
    self.effect_size = 16
    self.enc_rep_size = representation_size - self.effect_size
    self.interaction_size = 128
    # self.action_encoder = Mlp((128,), self.action_enc_size, action_size,
    #                           hidden_activation=nn.ELU())
    self.lambda_encoder = Mlp(
        (128,),
        self.enc_rep_size,
        representation_size,
        hidden_activation=nn.ELU(),
    )
    # Embeds pairs of encoded representations into interaction features.
    self.embedding_network = Mlp(
        (256,),
        self.interaction_size,
        self.enc_rep_size * 2,
        hidden_activation=nn.ELU(),
        output_activation=nn.ELU(),
    )
    self.effect_network = Mlp(
        (128,),
        self.interaction_size,
        self.interaction_size,
        hidden_activation=nn.ELU(),
        output_activation=nn.ELU(),
    )
    # Scalar attention weight per interaction.
    self.attention_network = Mlp(
        (128,),
        1,
        self.interaction_size,
        hidden_activation=nn.ELU(),
        output_activation=nn.Sigmoid(),
    )
    self.encoder_network = Mlp(
        (128,),
        self.effect_size,
        self.interaction_size,
        hidden_activation=nn.ELU(),
    )
def gen_network(variant, action_dim, layer_size, policy=False):
    return FlatFoodNetworkMedium(
        img_network=Mlp(**variant['img_network_kwargs']),
        full_img_network=Mlp(**variant['full_img_network_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['img_network_kwargs']['output_size']
            + variant['full_img_network_kwargs']['output_size']
            + variant['inventory_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity),
        sizes=[
            variant['img_network_kwargs']['input_size'],
            variant['full_img_network_kwargs']['input_size'],
            # health dim
            1,
            # pantry dim
            400,
            # shelf dim
            40,
        ])